gcc/tree-vect-slp.cc
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "insn-config.h"
35 #include "recog.h" /* FIXME: for insn_data */
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 #include "dump-context.h"
49 #include "cfganal.h"
50 #include "tree-eh.h"
51 #include "tree-cfg.h"
52 #include "alloc-pool.h"
53 #include "sreal.h"
54 #include "predict.h"
56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
71 static object_allocator<_slp_tree> *slp_tree_pool;
72 static slp_tree slp_first_node;
74 void
75 vect_slp_init (void)
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
80 void
81 vect_slp_fini (void)
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
89 void *
90 _slp_tree::operator new (size_t n)
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
96 void
97 _slp_tree::operator delete (void *node, size_t n)
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (node);
104 /* Initialize an SLP node. */
106 _slp_tree::_slp_tree ()
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_DEFS (this) = vNULL;
116 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
117 SLP_TREE_CHILDREN (this) = vNULL;
118 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
119 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
120 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
121 SLP_TREE_CODE (this) = ERROR_MARK;
122 SLP_TREE_VECTYPE (this) = NULL_TREE;
123 SLP_TREE_REPRESENTATIVE (this) = NULL;
124 SLP_TREE_REF_COUNT (this) = 1;
125 this->failed = NULL;
126 this->max_nunits = 1;
127 this->lanes = 0;
130 /* Tear down an SLP node. */
132 _slp_tree::~_slp_tree ()
134 if (this->prev_node)
135 this->prev_node->next_node = this->next_node;
136 else
137 slp_first_node = this->next_node;
138 if (this->next_node)
139 this->next_node->prev_node = this->prev_node;
140 SLP_TREE_CHILDREN (this).release ();
141 SLP_TREE_SCALAR_STMTS (this).release ();
142 SLP_TREE_SCALAR_OPS (this).release ();
143 SLP_TREE_VEC_DEFS (this).release ();
144 SLP_TREE_LOAD_PERMUTATION (this).release ();
145 SLP_TREE_LANE_PERMUTATION (this).release ();
146 if (this->failed)
147 free (failed);
150 /* Push the single SSA definition in DEF to the vector of vector defs. */
152 void
153 _slp_tree::push_vec_def (gimple *def)
155 if (gphi *phi = dyn_cast <gphi *> (def))
156 vec_defs.quick_push (gimple_phi_result (phi));
157 else
159 def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
160 vec_defs.quick_push (get_def_from_ptr (defop));
164 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
166 void
167 vect_free_slp_tree (slp_tree node)
169 int i;
170 slp_tree child;
172 if (--SLP_TREE_REF_COUNT (node) != 0)
173 return;
175 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
176 if (child)
177 vect_free_slp_tree (child);
179 /* If the node defines any SLP only patterns then those patterns are no
180 longer valid and should be removed. */
181 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
182 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
184 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
185 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
186 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
189 delete node;
192 /* Return a location suitable for dumps related to the SLP instance. */
194 dump_user_location_t
195 _slp_instance::location () const
197 if (!root_stmts.is_empty ())
198 return root_stmts[0]->stmt;
199 else
200 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
204 /* Free the memory allocated for the SLP instance. */
206 void
207 vect_free_slp_instance (slp_instance instance)
209 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
210 SLP_INSTANCE_LOADS (instance).release ();
211 SLP_INSTANCE_ROOT_STMTS (instance).release ();
212 SLP_INSTANCE_REMAIN_DEFS (instance).release ();
213 instance->subgraph_entries.release ();
214 instance->cost_vec.release ();
215 free (instance);
219 /* Create an SLP node for SCALAR_STMTS. */
221 slp_tree
222 vect_create_new_slp_node (unsigned nops, tree_code code)
224 slp_tree node = new _slp_tree;
225 SLP_TREE_SCALAR_STMTS (node) = vNULL;
226 SLP_TREE_CHILDREN (node).create (nops);
227 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
228 SLP_TREE_CODE (node) = code;
229 return node;
231 /* Create an SLP node for SCALAR_STMTS. */
233 static slp_tree
234 vect_create_new_slp_node (slp_tree node,
235 vec<stmt_vec_info> scalar_stmts, unsigned nops)
237 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
238 SLP_TREE_CHILDREN (node).create (nops);
239 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
240 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
241 SLP_TREE_LANES (node) = scalar_stmts.length ();
242 return node;
245 /* Create an SLP node for SCALAR_STMTS. */
247 static slp_tree
248 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
250 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
253 /* Create an SLP node for OPS. */
255 static slp_tree
256 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
258 SLP_TREE_SCALAR_OPS (node) = ops;
259 SLP_TREE_DEF_TYPE (node) = vect_external_def;
260 SLP_TREE_LANES (node) = ops.length ();
261 return node;
264 /* Create an SLP node for OPS. */
266 static slp_tree
267 vect_create_new_slp_node (vec<tree> ops)
269 return vect_create_new_slp_node (new _slp_tree, ops);
273 /* This structure is used in creation of an SLP tree. Each instance
274 corresponds to the same operand in a group of scalar stmts in an SLP
275 node. */
276 typedef struct _slp_oprnd_info
278 /* Def-stmts for the operands. */
279 vec<stmt_vec_info> def_stmts;
280 /* Operands. */
281 vec<tree> ops;
282 /* Information about the first statement, its vector def-type, type, the
283 operand itself in case it's constant, and an indication if it's a pattern
284 stmt. */
285 tree first_op_type;
286 enum vect_def_type first_dt;
287 bool any_pattern;
288 } *slp_oprnd_info;
291 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
292 operand. */
293 static vec<slp_oprnd_info>
294 vect_create_oprnd_info (int nops, int group_size)
296 int i;
297 slp_oprnd_info oprnd_info;
298 vec<slp_oprnd_info> oprnds_info;
300 oprnds_info.create (nops);
301 for (i = 0; i < nops; i++)
303 oprnd_info = XNEW (struct _slp_oprnd_info);
304 oprnd_info->def_stmts.create (group_size);
305 oprnd_info->ops.create (group_size);
306 oprnd_info->first_dt = vect_uninitialized_def;
307 oprnd_info->first_op_type = NULL_TREE;
308 oprnd_info->any_pattern = false;
309 oprnds_info.quick_push (oprnd_info);
312 return oprnds_info;
316 /* Free operands info. */
318 static void
319 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
321 int i;
322 slp_oprnd_info oprnd_info;
324 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
326 oprnd_info->def_stmts.release ();
327 oprnd_info->ops.release ();
328 XDELETE (oprnd_info);
331 oprnds_info.release ();
334 /* Return the execution frequency of NODE (so that a higher value indicates
335 a "more important" node when optimizing for speed). */
337 static sreal
338 vect_slp_node_weight (slp_tree node)
340 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
341 basic_block bb = gimple_bb (stmt_info->stmt);
342 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
345 /* Return true if STMTS contains a pattern statement. */
347 static bool
348 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
350 stmt_vec_info stmt_info;
351 unsigned int i;
352 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
353 if (is_pattern_stmt_p (stmt_info))
354 return true;
355 return false;
358 /* Return true when all lanes in the external or constant NODE have
359 the same value. */
361 static bool
362 vect_slp_tree_uniform_p (slp_tree node)
364 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
365 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
367 /* Pre-existing vectors. */
368 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
369 return false;
371 unsigned i;
372 tree op, first = NULL_TREE;
373 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
374 if (!first)
375 first = op;
376 else if (!operand_equal_p (first, op, 0))
377 return false;
379 return true;
382 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
383 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
384 of the chain. */
387 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
388 stmt_vec_info first_stmt_info)
390 stmt_vec_info next_stmt_info = first_stmt_info;
391 int result = 0;
393 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
394 return -1;
398 if (next_stmt_info == stmt_info)
399 return result;
400 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
401 if (next_stmt_info)
402 result += DR_GROUP_GAP (next_stmt_info);
404 while (next_stmt_info);
406 return -1;
409 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
410 using the method implemented by duplicate_and_interleave. Return true
411 if so, returning the number of intermediate vectors in *NVECTORS_OUT
412 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
413 (if nonnull). */
415 bool
416 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
417 tree elt_type, unsigned int *nvectors_out,
418 tree *vector_type_out,
419 tree *permutes)
421 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
422 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
423 return false;
425 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
426 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
427 unsigned int nvectors = 1;
428 for (;;)
430 scalar_int_mode int_mode;
431 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
432 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
434 /* Get the natural vector type for this SLP group size. */
435 tree int_type = build_nonstandard_integer_type
436 (GET_MODE_BITSIZE (int_mode), 1);
437 tree vector_type
438 = get_vectype_for_scalar_type (vinfo, int_type, count);
439 poly_int64 half_nelts;
440 if (vector_type
441 && VECTOR_MODE_P (TYPE_MODE (vector_type))
442 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
443 GET_MODE_SIZE (base_vector_mode))
444 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
445 2, &half_nelts))
447 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
448 together into elements of type INT_TYPE and using the result
449 to build NVECTORS vectors. */
450 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
451 vec_perm_builder sel1 (nelts, 2, 3);
452 vec_perm_builder sel2 (nelts, 2, 3);
454 for (unsigned int i = 0; i < 3; ++i)
456 sel1.quick_push (i);
457 sel1.quick_push (i + nelts);
458 sel2.quick_push (half_nelts + i);
459 sel2.quick_push (half_nelts + i + nelts);
461 vec_perm_indices indices1 (sel1, 2, nelts);
462 vec_perm_indices indices2 (sel2, 2, nelts);
463 machine_mode vmode = TYPE_MODE (vector_type);
464 if (can_vec_perm_const_p (vmode, vmode, indices1)
465 && can_vec_perm_const_p (vmode, vmode, indices2))
467 if (nvectors_out)
468 *nvectors_out = nvectors;
469 if (vector_type_out)
470 *vector_type_out = vector_type;
471 if (permutes)
473 permutes[0] = vect_gen_perm_mask_checked (vector_type,
474 indices1);
475 permutes[1] = vect_gen_perm_mask_checked (vector_type,
476 indices2);
478 return true;
482 if (!multiple_p (elt_bytes, 2, &elt_bytes))
483 return false;
484 nvectors *= 2;
488 /* Return true if DTA and DTB match. */
490 static bool
491 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
493 return (dta == dtb
494 || ((dta == vect_external_def || dta == vect_constant_def)
495 && (dtb == vect_external_def || dtb == vect_constant_def)));
498 static const int cond_expr_maps[3][5] = {
499 { 4, -1, -2, 1, 2 },
500 { 4, -2, -1, 1, 2 },
501 { 4, -1, -2, 2, 1 }
503 static const int arg1_map[] = { 1, 1 };
504 static const int arg2_map[] = { 1, 2 };
505 static const int arg1_arg4_map[] = { 2, 1, 4 };
506 static const int arg3_arg2_map[] = { 2, 3, 2 };
507 static const int op1_op0_map[] = { 2, 1, 0 };
509 /* For most SLP statements, there is a one-to-one mapping between
510 gimple arguments and child nodes. If that is not true for STMT,
511 return an array that contains:
513 - the number of child nodes, followed by
514 - for each child node, the index of the argument associated with that node.
515 The special index -1 refers to the first operand of an embedded comparison and
516 the special index -2 to the second operand of an embedded comparison.
518 SWAP is as for vect_get_and_check_slp_defs. */
520 static const int *
521 vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
523 if (auto assign = dyn_cast<const gassign *> (stmt))
525 if (gimple_assign_rhs_code (assign) == COND_EXPR
526 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
527 return cond_expr_maps[swap];
528 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
529 && swap)
530 return op1_op0_map;
532 gcc_assert (!swap);
533 if (auto call = dyn_cast<const gcall *> (stmt))
535 if (gimple_call_internal_p (call))
536 switch (gimple_call_internal_fn (call))
538 case IFN_MASK_LOAD:
539 return arg2_map;
541 case IFN_GATHER_LOAD:
542 return arg1_map;
544 case IFN_MASK_GATHER_LOAD:
545 return arg1_arg4_map;
547 case IFN_MASK_STORE:
548 return arg3_arg2_map;
550 default:
551 break;
554 return nullptr;
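/* For illustration, reading the maps above: arg3_arg2_map == { 2, 3, 2 }
   says an IFN_MASK_STORE stmt gets two SLP children, child 0 built from
   call argument 3 (the stored value) and child 1 from call argument 2
   (the mask); cond_expr_maps[0] == { 4, -1, -2, 1, 2 } says a COND_EXPR
   with an embedded comparison gets four children: the two operands of the
   comparison followed by gimple arguments 1 and 2 (the then and else
   values).  */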
557 /* Return the SLP node child index for operand OP of STMT. */
560 vect_slp_child_index_for_operand (const gimple *stmt, int op)
562 const int *opmap = vect_get_operand_map (stmt);
563 if (!opmap)
564 return op;
565 for (int i = 1; i < 1 + opmap[0]; ++i)
566 if (opmap[i] == op)
567 return i - 1;
568 gcc_unreachable ();
571 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
572 they are of a valid type and that they match the defs of the first stmt of
573 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
574 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
575 indicates swap is required for cond_expr stmts. Specifically, SWAP
576 is 1 if STMT is cond and operands of comparison need to be swapped;
577 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
579 If there was a fatal error return -1; if the error could be corrected by
580 swapping operands of the father node of this one, return 1; if everything is
581 ok return 0. */
582 static int
583 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
584 bool *skip_args,
585 vec<stmt_vec_info> stmts, unsigned stmt_num,
586 vec<slp_oprnd_info> *oprnds_info)
588 stmt_vec_info stmt_info = stmts[stmt_num];
589 tree oprnd;
590 unsigned int i, number_of_oprnds;
591 enum vect_def_type dt = vect_uninitialized_def;
592 slp_oprnd_info oprnd_info;
593 unsigned int commutative_op = -1U;
594 bool first = stmt_num == 0;
596 if (!is_a<gcall *> (stmt_info->stmt)
597 && !is_a<gassign *> (stmt_info->stmt)
598 && !is_a<gphi *> (stmt_info->stmt))
599 return -1;
601 number_of_oprnds = gimple_num_args (stmt_info->stmt);
602 const int *map = vect_get_operand_map (stmt_info->stmt, swap);
603 if (map)
604 number_of_oprnds = *map++;
605 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
607 if (gimple_call_internal_p (stmt))
609 internal_fn ifn = gimple_call_internal_fn (stmt);
610 commutative_op = first_commutative_argument (ifn);
613 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
615 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
616 commutative_op = 0;
619 bool swapped = (swap != 0);
620 bool backedge = false;
621 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
622 for (i = 0; i < number_of_oprnds; i++)
624 int opno = map ? map[i] : int (i);
625 if (opno < 0)
626 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
627 else
629 oprnd = gimple_arg (stmt_info->stmt, opno);
630 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
632 edge e = gimple_phi_arg_edge (stmt, opno);
633 backedge = (is_a <bb_vec_info> (vinfo)
634 ? e->flags & EDGE_DFS_BACK
635 : dominated_by_p (CDI_DOMINATORS, e->src,
636 gimple_bb (stmt_info->stmt)));
639 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
640 oprnd = TREE_OPERAND (oprnd, 0);
642 oprnd_info = (*oprnds_info)[i];
644 stmt_vec_info def_stmt_info;
645 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
647 if (dump_enabled_p ())
648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
649 "Build SLP failed: can't analyze def for %T\n",
650 oprnd);
652 return -1;
655 if (skip_args[i])
657 oprnd_info->def_stmts.quick_push (NULL);
658 oprnd_info->ops.quick_push (NULL_TREE);
659 oprnd_info->first_dt = vect_uninitialized_def;
660 continue;
663 oprnd_info->def_stmts.quick_push (def_stmt_info);
664 oprnd_info->ops.quick_push (oprnd);
666 if (def_stmt_info
667 && is_pattern_stmt_p (def_stmt_info))
669 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
670 != def_stmt_info)
671 oprnd_info->any_pattern = true;
672 else
673 /* If we promote this to external use the original stmt def. */
674 oprnd_info->ops.last ()
675 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
678 /* If there's an extern def on a backedge make sure we can
679 code-generate at the region start.
680 ??? This is another case that could be fixed by adjusting
681 how we split the function but at the moment we'd have conflicting
682 goals there. */
683 if (backedge
684 && dts[i] == vect_external_def
685 && is_a <bb_vec_info> (vinfo)
686 && TREE_CODE (oprnd) == SSA_NAME
687 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
688 && !dominated_by_p (CDI_DOMINATORS,
689 as_a <bb_vec_info> (vinfo)->bbs[0],
690 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
692 if (dump_enabled_p ())
693 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
694 "Build SLP failed: extern def %T only defined "
695 "on backedge\n", oprnd);
696 return -1;
699 if (first)
701 tree type = TREE_TYPE (oprnd);
702 dt = dts[i];
703 if ((dt == vect_constant_def
704 || dt == vect_external_def)
705 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
706 && (TREE_CODE (type) == BOOLEAN_TYPE
707 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
708 type)))
710 if (dump_enabled_p ())
711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
712 "Build SLP failed: invalid type of def "
713 "for variable-length SLP %T\n", oprnd);
714 return -1;
717 /* For the swapping logic below force vect_reduction_def
718 for the reduction op in an SLP reduction group. */
719 if (!STMT_VINFO_DATA_REF (stmt_info)
720 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
721 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
722 && def_stmt_info)
723 dts[i] = dt = vect_reduction_def;
725 /* Check the types of the definition. */
726 switch (dt)
728 case vect_external_def:
729 case vect_constant_def:
730 case vect_internal_def:
731 case vect_reduction_def:
732 case vect_induction_def:
733 case vect_nested_cycle:
734 case vect_first_order_recurrence:
735 break;
737 default:
738 /* FORNOW: Not supported. */
739 if (dump_enabled_p ())
740 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
741 "Build SLP failed: illegal type of def %T\n",
742 oprnd);
743 return -1;
746 oprnd_info->first_dt = dt;
747 oprnd_info->first_op_type = type;
750 if (first)
751 return 0;
753 /* Now match the operand definition types to that of the first stmt. */
754 for (i = 0; i < number_of_oprnds;)
756 if (skip_args[i])
758 ++i;
759 continue;
762 oprnd_info = (*oprnds_info)[i];
763 dt = dts[i];
764 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
765 oprnd = oprnd_info->ops[stmt_num];
766 tree type = TREE_TYPE (oprnd);
768 if (!types_compatible_p (oprnd_info->first_op_type, type))
770 if (dump_enabled_p ())
771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
772 "Build SLP failed: different operand types\n");
773 return 1;
776 /* Not first stmt of the group, check that the def-stmt/s match
777 the def-stmt/s of the first stmt. Allow different definition
778 types for reduction chains: the first stmt must be a
779 vect_reduction_def (a phi node), and the rest
780 end in the reduction chain. */
781 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
782 && !(oprnd_info->first_dt == vect_reduction_def
783 && !STMT_VINFO_DATA_REF (stmt_info)
784 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
785 && def_stmt_info
786 && !STMT_VINFO_DATA_REF (def_stmt_info)
787 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
788 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
789 || (!STMT_VINFO_DATA_REF (stmt_info)
790 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
791 && ((!def_stmt_info
792 || STMT_VINFO_DATA_REF (def_stmt_info)
793 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
794 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
795 != (oprnd_info->first_dt != vect_reduction_def))))
797 /* Try swapping operands if we got a mismatch. For BB
798 vectorization only in case it will clearly improve things. */
799 if (i == commutative_op && !swapped
800 && (!is_a <bb_vec_info> (vinfo)
801 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
802 dts[i+1])
803 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
804 || vect_def_types_match
805 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
807 if (dump_enabled_p ())
808 dump_printf_loc (MSG_NOTE, vect_location,
809 "trying swapped operands\n");
810 std::swap (dts[i], dts[i+1]);
811 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
812 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
813 std::swap ((*oprnds_info)[i]->ops[stmt_num],
814 (*oprnds_info)[i+1]->ops[stmt_num]);
815 swapped = true;
816 continue;
819 if (is_a <bb_vec_info> (vinfo)
820 && !oprnd_info->any_pattern)
822 /* Now for commutative ops we should see whether we can
823 make the other operand match. */
824 if (dump_enabled_p ())
825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
826 "treating operand as external\n");
827 oprnd_info->first_dt = dt = vect_external_def;
829 else
831 if (dump_enabled_p ())
832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
833 "Build SLP failed: different types\n");
834 return 1;
838 /* Make sure to demote the overall operand to external. */
839 if (dt == vect_external_def)
840 oprnd_info->first_dt = vect_external_def;
841 /* For an SLP reduction chain we want to duplicate the reduction to
842 each of the chain members. That gets us a sane SLP graph (still
843 the stmts are not 100% correct wrt the initial values). */
844 else if ((dt == vect_internal_def
845 || dt == vect_reduction_def)
846 && oprnd_info->first_dt == vect_reduction_def
847 && !STMT_VINFO_DATA_REF (stmt_info)
848 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
849 && !STMT_VINFO_DATA_REF (def_stmt_info)
850 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
851 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
853 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
854 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
857 ++i;
860 /* Swap operands. */
861 if (swapped)
863 if (dump_enabled_p ())
864 dump_printf_loc (MSG_NOTE, vect_location,
865 "swapped operands to match def types in %G",
866 stmt_info->stmt);
869 return 0;
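/* A minimal, hypothetical scalar example (not from the original sources)
   of the operand swapping performed above: with the commutative PLUS_EXPR
   the first lane has operand def types (internal, constant) while the
   second has (constant, internal), and swapping the second lane's operands
   makes the per-operand def types match.  */
#if 0
int r[2], a[2];
void
example_swapped_operands (void)
{
  r[0] = a[0] + 5;	/* operands: (load, constant) */
  r[1] = 5 + a[1];	/* operands: (constant, load), matched by swapping */
}
#endif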
872 /* Return true if call statements CALL1 and CALL2 are similar enough
873 to be combined into the same SLP group. */
875 bool
876 compatible_calls_p (gcall *call1, gcall *call2)
878 unsigned int nargs = gimple_call_num_args (call1);
879 if (nargs != gimple_call_num_args (call2))
880 return false;
882 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
883 return false;
885 if (gimple_call_internal_p (call1))
887 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
888 TREE_TYPE (gimple_call_lhs (call2))))
889 return false;
890 for (unsigned int i = 0; i < nargs; ++i)
891 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
892 TREE_TYPE (gimple_call_arg (call2, i))))
893 return false;
895 else
897 if (!operand_equal_p (gimple_call_fn (call1),
898 gimple_call_fn (call2), 0))
899 return false;
901 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
902 return false;
905 /* Check that any unvectorized arguments are equal. */
906 if (const int *map = vect_get_operand_map (call1))
908 unsigned int nkept = *map++;
909 unsigned int mapi = 0;
910 for (unsigned int i = 0; i < nargs; ++i)
911 if (mapi < nkept && map[mapi] == int (i))
912 mapi += 1;
913 else if (!operand_equal_p (gimple_call_arg (call1, i),
914 gimple_call_arg (call2, i)))
915 return false;
918 return true;
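/* A minimal, hypothetical scalar example (not from the original sources)
   of a call pair accepted by compatible_calls_p: both lanes invoke the
   same combined function with compatible argument and return types.  */
#if 0
float r[2], a[2];
void
example_compatible_calls (void)
{
  r[0] = __builtin_sqrtf (a[0]);
  r[1] = __builtin_sqrtf (a[1]);
}
#endif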
921 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
922 caller's attempt to find the vector type in STMT_INFO with the narrowest
923 element type. Return true if VECTYPE is nonnull and if it is valid
924 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
925 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
926 vect_build_slp_tree. */
928 static bool
929 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
930 unsigned int group_size,
931 tree vectype, poly_uint64 *max_nunits)
933 if (!vectype)
935 if (dump_enabled_p ())
936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
937 "Build SLP failed: unsupported data-type in %G\n",
938 stmt_info->stmt);
939 /* Fatal mismatch. */
940 return false;
943 /* If populating the vector type requires unrolling then fail
944 before adjusting *max_nunits for basic-block vectorization. */
945 if (is_a <bb_vec_info> (vinfo)
946 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
948 if (dump_enabled_p ())
949 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
950 "Build SLP failed: unrolling required "
951 "in basic block SLP\n");
952 /* Fatal mismatch. */
953 return false;
956 /* In case of multiple types we need to detect the smallest type. */
957 vect_update_max_nunits (max_nunits, vectype);
958 return true;
961 /* Verify whether the scalar stmts STMTS are isomorphic, whether they require
962 data permutation, or whether they use unsupported types of operation. Return
963 true if they are isomorphic, otherwise return false and indicate in *MATCHES
964 which stmts are not isomorphic to the first one. If MATCHES[0]
965 is false then this indicates the comparison could not be
966 carried out or the stmts will never be vectorized by SLP.
968 Note COND_EXPR is possibly isomorphic to another one after swapping its
969 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
970 the first stmt by swapping the two operands of comparison; set SWAP[i]
971 to 2 if stmt I is isomorphic to the first stmt by inverting the code
972 of comparison. Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
973 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
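/* For illustration, a hypothetical group of scalar stmts matching the
   description above:

     r[0] = a[0] >= b[0] ? x[0] : y[0];
     r[1] = b[1] <= a[1] ? x[1] : y[1];   SWAP[1] == 1 (swapped comparison)
     r[2] = a[2] <  b[2] ? y[2] : x[2];   SWAP[2] == 2 (inverted comparison)
     r[3] = a[3] >= b[3] ? x[3] : y[3];   SWAP[3] == 0 (already isomorphic)  */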
975 static bool
976 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
977 vec<stmt_vec_info> stmts, unsigned int group_size,
978 poly_uint64 *max_nunits, bool *matches,
979 bool *two_operators, tree *node_vectype)
981 unsigned int i;
982 stmt_vec_info first_stmt_info = stmts[0];
983 code_helper first_stmt_code = ERROR_MARK;
984 code_helper alt_stmt_code = ERROR_MARK;
985 code_helper rhs_code = ERROR_MARK;
986 code_helper first_cond_code = ERROR_MARK;
987 tree lhs;
988 bool need_same_oprnds = false;
989 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
990 stmt_vec_info first_load = NULL, prev_first_load = NULL;
991 bool first_stmt_ldst_p = false, ldst_p = false;
992 bool first_stmt_phi_p = false, phi_p = false;
993 bool maybe_soft_fail = false;
994 tree soft_fail_nunits_vectype = NULL_TREE;
996 /* For every stmt in NODE find its def stmt/s. */
997 stmt_vec_info stmt_info;
998 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1000 gimple *stmt = stmt_info->stmt;
1001 swap[i] = 0;
1002 matches[i] = false;
1004 if (dump_enabled_p ())
1005 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1007 /* Fail to vectorize statements marked as unvectorizable, throw
1008 or are volatile. */
1009 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1010 || stmt_can_throw_internal (cfun, stmt)
1011 || gimple_has_volatile_ops (stmt))
1013 if (dump_enabled_p ())
1014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1015 "Build SLP failed: unvectorizable statement %G",
1016 stmt);
1017 /* ??? For BB vectorization we want to commutate operands in a way
1018 to shuffle all unvectorizable defs into one operand and have
1019 the other still vectorized. The following doesn't reliably
1020 work for this, but it's the easiest we can do here. */
1021 if (is_a <bb_vec_info> (vinfo) && i != 0)
1022 continue;
1023 /* Fatal mismatch. */
1024 matches[0] = false;
1025 return false;
1028 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1029 lhs = gimple_get_lhs (stmt);
1030 if (lhs == NULL_TREE
1031 && (!call_stmt
1032 || !gimple_call_internal_p (stmt)
1033 || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1035 if (dump_enabled_p ())
1036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1037 "Build SLP failed: not GIMPLE_ASSIGN nor "
1038 "GIMPLE_CALL %G", stmt);
1039 if (is_a <bb_vec_info> (vinfo) && i != 0)
1040 continue;
1041 /* Fatal mismatch. */
1042 matches[0] = false;
1043 return false;
1046 tree nunits_vectype;
1047 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1048 &nunits_vectype, group_size))
1050 if (is_a <bb_vec_info> (vinfo) && i != 0)
1051 continue;
1052 /* Fatal mismatch. */
1053 matches[0] = false;
1054 return false;
1056 /* Record nunits required but continue analysis, producing matches[]
1057 as if nunits was not an issue. This allows splitting of groups
1058 to happen. */
1059 if (nunits_vectype
1060 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1061 nunits_vectype, max_nunits))
1063 gcc_assert (is_a <bb_vec_info> (vinfo));
1064 maybe_soft_fail = true;
1065 soft_fail_nunits_vectype = nunits_vectype;
1068 gcc_assert (vectype);
1070 if (call_stmt)
1072 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1073 if (cfn != CFN_LAST)
1074 rhs_code = cfn;
1075 else
1076 rhs_code = CALL_EXPR;
1078 if (cfn == CFN_MASK_LOAD
1079 || cfn == CFN_GATHER_LOAD
1080 || cfn == CFN_MASK_GATHER_LOAD)
1081 ldst_p = true;
1082 else if (cfn == CFN_MASK_STORE)
1084 ldst_p = true;
1085 rhs_code = CFN_MASK_STORE;
1087 else if ((internal_fn_p (cfn)
1088 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1089 || gimple_call_tail_p (call_stmt)
1090 || gimple_call_noreturn_p (call_stmt)
1091 || gimple_call_chain (call_stmt))
1093 if (dump_enabled_p ())
1094 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1095 "Build SLP failed: unsupported call type %G",
1096 (gimple *) call_stmt);
1097 if (is_a <bb_vec_info> (vinfo) && i != 0)
1098 continue;
1099 /* Fatal mismatch. */
1100 matches[0] = false;
1101 return false;
1104 else if (gimple_code (stmt) == GIMPLE_PHI)
1106 rhs_code = ERROR_MARK;
1107 phi_p = true;
1109 else
1111 rhs_code = gimple_assign_rhs_code (stmt);
1112 ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1115 /* Check the operation. */
1116 if (i == 0)
1118 *node_vectype = vectype;
1119 first_stmt_code = rhs_code;
1120 first_stmt_ldst_p = ldst_p;
1121 first_stmt_phi_p = phi_p;
1123 /* Shift arguments should be equal in all the packed stmts for a
1124 vector shift with scalar shift operand. */
1125 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1126 || rhs_code == LROTATE_EXPR
1127 || rhs_code == RROTATE_EXPR)
1129 /* First see if we have a vector/vector shift. */
1130 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1132 /* No vector/vector shift, try for a vector/scalar shift. */
1133 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1135 if (dump_enabled_p ())
1136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1137 "Build SLP failed: "
1138 "op not supported by target.\n");
1139 if (is_a <bb_vec_info> (vinfo) && i != 0)
1140 continue;
1141 /* Fatal mismatch. */
1142 matches[0] = false;
1143 return false;
1145 need_same_oprnds = true;
1146 first_op1 = gimple_assign_rhs2 (stmt);
1149 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1151 need_same_oprnds = true;
1152 first_op1 = gimple_assign_rhs2 (stmt);
1154 else if (!ldst_p
1155 && rhs_code == BIT_FIELD_REF)
1157 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1158 if (!is_a <bb_vec_info> (vinfo)
1159 || TREE_CODE (vec) != SSA_NAME
1160 /* When the element types are not compatible we pun the
1161 source to the target vectype which requires equal size. */
1162 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1163 || !types_compatible_p (TREE_TYPE (vectype),
1164 TREE_TYPE (TREE_TYPE (vec))))
1165 && !operand_equal_p (TYPE_SIZE (vectype),
1166 TYPE_SIZE (TREE_TYPE (vec)))))
1168 if (dump_enabled_p ())
1169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1170 "Build SLP failed: "
1171 "BIT_FIELD_REF not supported\n");
1172 /* Fatal mismatch. */
1173 matches[0] = false;
1174 return false;
1177 else if (rhs_code == CFN_DIV_POW2)
1179 need_same_oprnds = true;
1180 first_op1 = gimple_call_arg (call_stmt, 1);
1183 else
1185 if (first_stmt_code != rhs_code
1186 && alt_stmt_code == ERROR_MARK)
1187 alt_stmt_code = rhs_code;
1188 if ((first_stmt_code != rhs_code
1189 && (first_stmt_code != IMAGPART_EXPR
1190 || rhs_code != REALPART_EXPR)
1191 && (first_stmt_code != REALPART_EXPR
1192 || rhs_code != IMAGPART_EXPR)
1193 /* Handle mismatches in plus/minus by computing both
1194 and merging the results. */
1195 && !((first_stmt_code == PLUS_EXPR
1196 || first_stmt_code == MINUS_EXPR)
1197 && (alt_stmt_code == PLUS_EXPR
1198 || alt_stmt_code == MINUS_EXPR)
1199 && rhs_code == alt_stmt_code)
1200 && !(first_stmt_code.is_tree_code ()
1201 && rhs_code.is_tree_code ()
1202 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1203 == tcc_comparison)
1204 && (swap_tree_comparison (tree_code (first_stmt_code))
1205 == tree_code (rhs_code)))
1206 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1207 && (first_stmt_code == ARRAY_REF
1208 || first_stmt_code == BIT_FIELD_REF
1209 || first_stmt_code == INDIRECT_REF
1210 || first_stmt_code == COMPONENT_REF
1211 || first_stmt_code == MEM_REF)
1212 && (rhs_code == ARRAY_REF
1213 || rhs_code == BIT_FIELD_REF
1214 || rhs_code == INDIRECT_REF
1215 || rhs_code == COMPONENT_REF
1216 || rhs_code == MEM_REF)))
1217 || first_stmt_ldst_p != ldst_p
1218 || first_stmt_phi_p != phi_p)
1220 if (dump_enabled_p ())
1222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1223 "Build SLP failed: different operation "
1224 "in stmt %G", stmt);
1225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1226 "original stmt %G", first_stmt_info->stmt);
1228 /* Mismatch. */
1229 continue;
1232 if (!ldst_p
1233 && first_stmt_code == BIT_FIELD_REF
1234 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1235 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1237 if (dump_enabled_p ())
1238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1239 "Build SLP failed: different BIT_FIELD_REF "
1240 "arguments in %G", stmt);
1241 /* Mismatch. */
1242 continue;
1245 if (call_stmt
1246 && first_stmt_code != CFN_MASK_LOAD
1247 && first_stmt_code != CFN_MASK_STORE)
1249 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1250 call_stmt))
1252 if (dump_enabled_p ())
1253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1254 "Build SLP failed: different calls in %G",
1255 stmt);
1256 /* Mismatch. */
1257 continue;
1261 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1262 && (gimple_bb (first_stmt_info->stmt)
1263 != gimple_bb (stmt_info->stmt)))
1265 if (dump_enabled_p ())
1266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1267 "Build SLP failed: different BB for PHI "
1268 "or possibly trapping operation in %G", stmt);
1269 /* Mismatch. */
1270 continue;
1273 if (need_same_oprnds)
1275 tree other_op1 = gimple_arg (stmt, 1);
1276 if (!operand_equal_p (first_op1, other_op1, 0))
1278 if (dump_enabled_p ())
1279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1280 "Build SLP failed: different shift "
1281 "arguments in %G", stmt);
1282 /* Mismatch. */
1283 continue;
1287 if (!types_compatible_p (vectype, *node_vectype))
1289 if (dump_enabled_p ())
1290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1291 "Build SLP failed: different vector type "
1292 "in %G", stmt);
1293 /* Mismatch. */
1294 continue;
1298 /* Grouped store or load. */
1299 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1301 gcc_assert (ldst_p);
1302 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1304 /* Store. */
1305 gcc_assert (rhs_code == CFN_MASK_STORE
1306 || REFERENCE_CLASS_P (lhs)
1307 || DECL_P (lhs));
1309 else
1311 /* Load. */
1312 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1313 if (prev_first_load)
1315 /* Check that there are no loads from different interleaving
1316 chains in the same node. */
1317 if (prev_first_load != first_load)
1319 if (dump_enabled_p ())
1320 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1321 vect_location,
1322 "Build SLP failed: different "
1323 "interleaving chains in one node %G",
1324 stmt);
1325 /* Mismatch. */
1326 continue;
1329 else
1330 prev_first_load = first_load;
1333 /* Non-grouped store or load. */
1334 else if (ldst_p)
1336 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1337 && rhs_code != CFN_GATHER_LOAD
1338 && rhs_code != CFN_MASK_GATHER_LOAD
1339 /* Not grouped loads are handled as externals for BB
1340 vectorization. For loop vectorization we can handle
1341 splats the same way we handle single element interleaving. */
1342 && (is_a <bb_vec_info> (vinfo)
1343 || stmt_info != first_stmt_info
1344 || STMT_VINFO_GATHER_SCATTER_P (stmt_info)))
1346 /* Not grouped load. */
1347 if (dump_enabled_p ())
1348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1349 "Build SLP failed: not grouped load %G", stmt);
1351 if (i != 0)
1352 continue;
1353 /* Fatal mismatch. */
1354 matches[0] = false;
1355 return false;
1358 /* Not memory operation. */
1359 else
1361 if (!phi_p
1362 && rhs_code.is_tree_code ()
1363 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1364 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1365 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1366 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1367 && rhs_code != VIEW_CONVERT_EXPR
1368 && rhs_code != CALL_EXPR
1369 && rhs_code != BIT_FIELD_REF)
1371 if (dump_enabled_p ())
1372 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1373 "Build SLP failed: operation unsupported %G",
1374 stmt);
1375 if (is_a <bb_vec_info> (vinfo) && i != 0)
1376 continue;
1377 /* Fatal mismatch. */
1378 matches[0] = false;
1379 return false;
1382 if (rhs_code == COND_EXPR)
1384 tree cond_expr = gimple_assign_rhs1 (stmt);
1385 enum tree_code cond_code = TREE_CODE (cond_expr);
1386 enum tree_code swap_code = ERROR_MARK;
1387 enum tree_code invert_code = ERROR_MARK;
1389 if (i == 0)
1390 first_cond_code = TREE_CODE (cond_expr);
1391 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1393 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1394 swap_code = swap_tree_comparison (cond_code);
1395 invert_code = invert_tree_comparison (cond_code, honor_nans);
1398 if (first_cond_code == cond_code)
1400 /* Isomorphism can be achieved by swapping. */
1401 else if (first_cond_code == swap_code)
1402 swap[i] = 1;
1403 /* Isomorphism can be achieved by inverting. */
1404 else if (first_cond_code == invert_code)
1405 swap[i] = 2;
1406 else
1408 if (dump_enabled_p ())
1409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1410 "Build SLP failed: different"
1411 " operation %G", stmt);
1412 /* Mismatch. */
1413 continue;
1417 if (rhs_code.is_tree_code ()
1418 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1419 && (swap_tree_comparison ((tree_code)first_stmt_code)
1420 == (tree_code)rhs_code))
1421 swap[i] = 1;
1424 matches[i] = true;
1427 for (i = 0; i < group_size; ++i)
1428 if (!matches[i])
1429 return false;
1431 /* If we allowed a two-operation SLP node verify the target can cope
1432 with the permute we are going to use. */
1433 if (alt_stmt_code != ERROR_MARK
1434 && (!alt_stmt_code.is_tree_code ()
1435 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1436 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1438 *two_operators = true;
1441 if (maybe_soft_fail)
1443 unsigned HOST_WIDE_INT const_nunits;
1444 if (!TYPE_VECTOR_SUBPARTS
1445 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1446 || const_nunits > group_size)
1447 matches[0] = false;
1448 else
1450 /* With constant vector elements simulate a mismatch at the
1451 point we need to split. */
1452 unsigned tail = group_size & (const_nunits - 1);
1453 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1455 return false;
1458 return true;
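/* A minimal, hypothetical scalar example (not from the original sources)
   of a "two operators" group accepted above: the PLUS/MINUS mismatch is
   allowed, alt_stmt_code becomes MINUS_EXPR, *TWO_OPERATORS is set and the
   lanes are later blended with a VEC_PERM_EXPR (see
   vect_slp_build_two_operator_nodes below).  */
#if 0
double r[2], a[2], b[2];
void
example_two_operators (void)
{
  r[0] = a[0] + b[0];
  r[1] = a[1] - b[1];
}
#endif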
1461 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1462 Note we never remove apart from at destruction time so we do not
1463 need a special value for deleted that differs from empty. */
1464 struct bst_traits
1466 typedef vec <stmt_vec_info> value_type;
1467 typedef vec <stmt_vec_info> compare_type;
1468 static inline hashval_t hash (value_type);
1469 static inline bool equal (value_type existing, value_type candidate);
1470 static inline bool is_empty (value_type x) { return !x.exists (); }
1471 static inline bool is_deleted (value_type x) { return !x.exists (); }
1472 static const bool empty_zero_p = true;
1473 static inline void mark_empty (value_type &x) { x.release (); }
1474 static inline void mark_deleted (value_type &x) { x.release (); }
1475 static inline void remove (value_type &x) { x.release (); }
1477 inline hashval_t
1478 bst_traits::hash (value_type x)
1480 inchash::hash h;
1481 for (unsigned i = 0; i < x.length (); ++i)
1482 h.add_int (gimple_uid (x[i]->stmt));
1483 return h.end ();
1485 inline bool
1486 bst_traits::equal (value_type existing, value_type candidate)
1488 if (existing.length () != candidate.length ())
1489 return false;
1490 for (unsigned i = 0; i < existing.length (); ++i)
1491 if (existing[i] != candidate[i])
1492 return false;
1493 return true;
1496 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1497 but then vec::insert does memmove and that's not compatible with
1498 std::pair. */
1499 struct chain_op_t
1501 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1502 : code (code_), dt (dt_), op (op_) {}
1503 tree_code code;
1504 vect_def_type dt;
1505 tree op;
1508 /* Comparator for sorting associatable chains. */
1510 static int
1511 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1513 auto *op1 = (const chain_op_t *) op1_;
1514 auto *op2 = (const chain_op_t *) op2_;
1515 if (op1->dt != op2->dt)
1516 return (int)op1->dt - (int)op2->dt;
1517 return (int)op1->code - (int)op2->code;
1520 /* Linearize the associatable expression chain at START with the
1521 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1522 filling CHAIN with the result and using WORKLIST as intermediate storage.
1523 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1524 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1525 stmts, starting with START. */
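/* For illustration: linearizing the lane r = ((a + b) - c) + d with
   CODE == PLUS_EXPR yields the CHAIN entries (+, a), (+, b), (-, c) and
   (+, d) (modulo visiting order), with CODE_STMT pointing at one of the
   PLUS_EXPR stmts and ALT_CODE_STMT at the MINUS_EXPR stmt.  */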
1527 static void
1528 vect_slp_linearize_chain (vec_info *vinfo,
1529 vec<std::pair<tree_code, gimple *> > &worklist,
1530 vec<chain_op_t> &chain,
1531 enum tree_code code, gimple *start,
1532 gimple *&code_stmt, gimple *&alt_code_stmt,
1533 vec<gimple *> *chain_stmts)
1535 /* For each lane linearize the addition/subtraction (or other
1536 uniform associatable operation) expression tree. */
1537 worklist.safe_push (std::make_pair (code, start));
1538 while (!worklist.is_empty ())
1540 auto entry = worklist.pop ();
1541 gassign *stmt = as_a <gassign *> (entry.second);
1542 enum tree_code in_code = entry.first;
1543 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1544 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1545 if (!code_stmt
1546 && gimple_assign_rhs_code (stmt) == code)
1547 code_stmt = stmt;
1548 else if (!alt_code_stmt
1549 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1550 alt_code_stmt = stmt;
1551 if (chain_stmts)
1552 chain_stmts->safe_push (stmt);
1553 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1555 tree op = gimple_op (stmt, opnum);
1556 vect_def_type dt;
1557 stmt_vec_info def_stmt_info;
1558 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1559 gcc_assert (res);
1560 if (dt == vect_internal_def
1561 && is_pattern_stmt_p (def_stmt_info))
1562 op = gimple_get_lhs (def_stmt_info->stmt);
1563 gimple *use_stmt;
1564 use_operand_p use_p;
1565 if (dt == vect_internal_def
1566 && single_imm_use (op, &use_p, &use_stmt)
1567 && is_gimple_assign (def_stmt_info->stmt)
1568 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1569 || (code == PLUS_EXPR
1570 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1571 == MINUS_EXPR))))
1573 tree_code op_def_code = this_code;
1574 if (op_def_code == MINUS_EXPR && opnum == 1)
1575 op_def_code = PLUS_EXPR;
1576 if (in_code == MINUS_EXPR)
1577 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1578 worklist.safe_push (std::make_pair (op_def_code,
1579 def_stmt_info->stmt));
1581 else
1583 tree_code op_def_code = this_code;
1584 if (op_def_code == MINUS_EXPR && opnum == 1)
1585 op_def_code = PLUS_EXPR;
1586 if (in_code == MINUS_EXPR)
1587 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1588 chain.safe_push (chain_op_t (op_def_code, dt, op));
1594 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1595 simple_hashmap_traits <bst_traits, slp_tree> >
1596 scalar_stmts_to_slp_tree_map_t;
1598 static slp_tree
1599 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1600 vec<stmt_vec_info> stmts, unsigned int group_size,
1601 poly_uint64 *max_nunits,
1602 bool *matches, unsigned *limit, unsigned *tree_size,
1603 scalar_stmts_to_slp_tree_map_t *bst_map);
1605 static slp_tree
1606 vect_build_slp_tree (vec_info *vinfo,
1607 vec<stmt_vec_info> stmts, unsigned int group_size,
1608 poly_uint64 *max_nunits,
1609 bool *matches, unsigned *limit, unsigned *tree_size,
1610 scalar_stmts_to_slp_tree_map_t *bst_map)
1612 if (slp_tree *leader = bst_map->get (stmts))
1614 if (dump_enabled_p ())
1615 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1616 !(*leader)->failed ? "" : "failed ",
1617 (void *) *leader);
1618 if (!(*leader)->failed)
1620 SLP_TREE_REF_COUNT (*leader)++;
1621 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1622 stmts.release ();
1623 return *leader;
1625 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1626 return NULL;
1629 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1630 so we can pick up backedge destinations during discovery. */
1631 slp_tree res = new _slp_tree;
1632 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1633 SLP_TREE_SCALAR_STMTS (res) = stmts;
1634 bst_map->put (stmts.copy (), res);
1636 if (*limit == 0)
1638 if (dump_enabled_p ())
1639 dump_printf_loc (MSG_NOTE, vect_location,
1640 "SLP discovery limit exceeded\n");
1641 /* Mark the node invalid so we can detect those when still in use
1642 as backedge destinations. */
1643 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1644 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1645 res->failed = XNEWVEC (bool, group_size);
1646 memset (res->failed, 0, sizeof (bool) * group_size);
1647 memset (matches, 0, sizeof (bool) * group_size);
1648 return NULL;
1650 --*limit;
1652 if (dump_enabled_p ())
1653 dump_printf_loc (MSG_NOTE, vect_location,
1654 "starting SLP discovery for node %p\n", (void *) res);
1656 poly_uint64 this_max_nunits = 1;
1657 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1658 &this_max_nunits,
1659 matches, limit, tree_size, bst_map);
1660 if (!res_)
1662 if (dump_enabled_p ())
1663 dump_printf_loc (MSG_NOTE, vect_location,
1664 "SLP discovery for node %p failed\n", (void *) res);
1665 /* Mark the node invalid so we can detect those when still in use
1666 as backedge destinations. */
1667 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1668 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1669 res->failed = XNEWVEC (bool, group_size);
1670 if (flag_checking)
1672 unsigned i;
1673 for (i = 0; i < group_size; ++i)
1674 if (!matches[i])
1675 break;
1676 gcc_assert (i < group_size);
1678 memcpy (res->failed, matches, sizeof (bool) * group_size);
1680 else
1682 if (dump_enabled_p ())
1683 dump_printf_loc (MSG_NOTE, vect_location,
1684 "SLP discovery for node %p succeeded\n",
1685 (void *) res);
1686 gcc_assert (res_ == res);
1687 res->max_nunits = this_max_nunits;
1688 vect_update_max_nunits (max_nunits, this_max_nunits);
1689 /* Keep a reference for the bst_map use. */
1690 SLP_TREE_REF_COUNT (res)++;
1692 return res_;
1695 /* Helper for building an associated SLP node chain. */
1697 static void
1698 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1699 slp_tree op0, slp_tree op1,
1700 stmt_vec_info oper1, stmt_vec_info oper2,
1701 vec<std::pair<unsigned, unsigned> > lperm)
1703 unsigned group_size = SLP_TREE_LANES (op1);
1705 slp_tree child1 = new _slp_tree;
1706 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1707 SLP_TREE_VECTYPE (child1) = vectype;
1708 SLP_TREE_LANES (child1) = group_size;
1709 SLP_TREE_CHILDREN (child1).create (2);
1710 SLP_TREE_CHILDREN (child1).quick_push (op0);
1711 SLP_TREE_CHILDREN (child1).quick_push (op1);
1712 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1714 slp_tree child2 = new _slp_tree;
1715 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1716 SLP_TREE_VECTYPE (child2) = vectype;
1717 SLP_TREE_LANES (child2) = group_size;
1718 SLP_TREE_CHILDREN (child2).create (2);
1719 SLP_TREE_CHILDREN (child2).quick_push (op0);
1720 SLP_TREE_REF_COUNT (op0)++;
1721 SLP_TREE_CHILDREN (child2).quick_push (op1);
1722 SLP_TREE_REF_COUNT (op1)++;
1723 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1725 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1726 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1727 SLP_TREE_VECTYPE (perm) = vectype;
1728 SLP_TREE_LANES (perm) = group_size;
1729 /* ??? We should set this to NULL but that's not expected. */
1730 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1731 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1732 SLP_TREE_CHILDREN (perm).quick_push (child1);
1733 SLP_TREE_CHILDREN (perm).quick_push (child2);
1736 /* Recursively build an SLP tree starting from NODE.
1737 Fail (and return NULL) if def-stmts are not
1738 isomorphic, require data permutation or are of unsupported types of
1739 operation. Otherwise, return the built SLP node.
1740 On failure, MATCHES indicates which lanes did not match
1741 the first one. */
1743 static slp_tree
1744 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1745 vec<stmt_vec_info> stmts, unsigned int group_size,
1746 poly_uint64 *max_nunits,
1747 bool *matches, unsigned *limit, unsigned *tree_size,
1748 scalar_stmts_to_slp_tree_map_t *bst_map)
1750 unsigned nops, i, this_tree_size = 0;
1751 poly_uint64 this_max_nunits = *max_nunits;
1753 matches[0] = false;
1755 stmt_vec_info stmt_info = stmts[0];
1756 if (!is_a<gcall *> (stmt_info->stmt)
1757 && !is_a<gassign *> (stmt_info->stmt)
1758 && !is_a<gphi *> (stmt_info->stmt))
1759 return NULL;
1761 nops = gimple_num_args (stmt_info->stmt);
1762 if (const int *map = vect_get_operand_map (stmt_info->stmt))
1763 nops = map[0];
1765 /* If the SLP node is a PHI (induction or reduction), terminate
1766 the recursion. */
1767 bool *skip_args = XALLOCAVEC (bool, nops);
1768 memset (skip_args, 0, sizeof (bool) * nops);
1769 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1770 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1772 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1773 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1774 group_size);
1775 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1776 max_nunits))
1777 return NULL;
1779 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1780 if (def_type == vect_induction_def)
1782 /* Induction PHIs are not cycles but walk the initial
1783 value. Only for inner loops though; for outer loops
1784 we need to pick up the value from the actual PHIs
1785 to more easily support peeling and epilogue vectorization. */
1786 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1787 if (!nested_in_vect_loop_p (loop, stmt_info))
1788 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1789 else
1790 loop = loop->inner;
1791 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1793 else if (def_type == vect_reduction_def
1794 || def_type == vect_double_reduction_def
1795 || def_type == vect_nested_cycle
1796 || def_type == vect_first_order_recurrence)
1798 /* Else def types have to match. */
1799 stmt_vec_info other_info;
1800 bool all_same = true;
1801 FOR_EACH_VEC_ELT (stmts, i, other_info)
1803 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1804 return NULL;
1805 if (other_info != stmt_info)
1806 all_same = false;
1808 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1809 /* Reduction initial values are not explicitly represented. */
1810 if (def_type != vect_first_order_recurrence
1811 && !nested_in_vect_loop_p (loop, stmt_info))
1812 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1813 /* Reduction chain backedge defs are filled manually.
1814 ??? Need a better way to identify an SLP reduction chain PHI.
1815 Or a better overall way to SLP match those. */
1816 if (all_same && def_type == vect_reduction_def)
1817 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1819 else if (def_type != vect_internal_def)
1820 return NULL;
1824 bool two_operators = false;
1825 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1826 tree vectype = NULL_TREE;
1827 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1828 &this_max_nunits, matches, &two_operators,
1829 &vectype))
1830 return NULL;
1832 /* If the SLP node is a load, terminate the recursion unless masked. */
1833 if (STMT_VINFO_DATA_REF (stmt_info)
1834 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1836 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1837 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1838 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1839 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1840 else
1842 *max_nunits = this_max_nunits;
1843 (*tree_size)++;
1844 node = vect_create_new_slp_node (node, stmts, 0);
1845 SLP_TREE_VECTYPE (node) = vectype;
1846 /* And compute the load permutation. Whether it is actually
1847 a permutation depends on the unrolling factor which is
1848 decided later. */
1849 vec<unsigned> load_permutation;
1850 int j;
1851 stmt_vec_info load_info;
1852 load_permutation.create (group_size);
1853 stmt_vec_info first_stmt_info
1854 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1855 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1857 int load_place;
1858 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1859 load_place = vect_get_place_in_interleaving_chain
1860 (load_info, first_stmt_info);
1861 else
1862 load_place = 0;
1863 gcc_assert (load_place != -1);
1864 load_permutation.safe_push (load_place);
1866 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1867 return node;
1870 else if (gimple_assign_single_p (stmt_info->stmt)
1871 && !gimple_vuse (stmt_info->stmt)
1872 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1874 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1875 the same SSA name vector whose type is compatible with vectype. */
1876 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1877 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1878 stmt_vec_info estmt_info;
1879 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1881 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1882 tree bfref = gimple_assign_rhs1 (estmt);
1883 HOST_WIDE_INT lane;
1884 if (!known_eq (bit_field_size (bfref),
1885 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1886 || !constant_multiple_p (bit_field_offset (bfref),
1887 bit_field_size (bfref), &lane))
1889 lperm.release ();
1890 matches[0] = false;
1891 return NULL;
1893 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1895 slp_tree vnode = vect_create_new_slp_node (vNULL);
1896 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1897 /* ??? We record vectype here but we hide eventually necessary
1898 punning and instead rely on code generation to materialize
1899 VIEW_CONVERT_EXPRs as necessary. We instead should make
1900 this explicit somehow. */
1901 SLP_TREE_VECTYPE (vnode) = vectype;
1902 else
1904 /* For different size but compatible elements we can still
1905 use VEC_PERM_EXPR without punning. */
1906 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
1907 && types_compatible_p (TREE_TYPE (vectype),
1908 TREE_TYPE (TREE_TYPE (vec))));
1909 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1911 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
1912 unsigned HOST_WIDE_INT const_nunits;
1913 if (nunits.is_constant (&const_nunits))
1914 SLP_TREE_LANES (vnode) = const_nunits;
1915 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1916 /* We are always building a permutation node even if it is an identity
1917 permute to shield the rest of the vectorizer from the odd node
1918 representing an actual vector without any scalar ops.
1919 ??? We could hide it completely by making the permute node
1920 external? */
1921 node = vect_create_new_slp_node (node, stmts, 1);
1922 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1923 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1924 SLP_TREE_VECTYPE (node) = vectype;
1925 SLP_TREE_CHILDREN (node).quick_push (vnode);
1926 return node;
1928 /* When discovery reaches an associatable operation, see whether we can
1929 improve that to match up lanes in a way superior to the operand
1930 swapping code which at most looks at two defs.
1931 ??? For BB vectorization we cannot do the brute-force search
1932 for matching as we can succeed by means of builds from scalars
1933 and have no good way to "cost" one build against another. */
1934 else if (is_a <loop_vec_info> (vinfo)
1935 /* ??? We don't handle !vect_internal_def defs below. */
1936 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1937 && is_gimple_assign (stmt_info->stmt)
1938 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1939 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1940 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1941 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1942 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1944 /* See if we have a chain of (mixed) adds or subtracts or other
1945 associatable ops. */
1946 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1947 if (code == MINUS_EXPR)
1948 code = PLUS_EXPR;
1949 stmt_vec_info other_op_stmt_info = NULL;
1950 stmt_vec_info op_stmt_info = NULL;
1951 unsigned chain_len = 0;
1952 auto_vec<chain_op_t> chain;
1953 auto_vec<std::pair<tree_code, gimple *> > worklist;
1954 auto_vec<vec<chain_op_t> > chains (group_size);
1955 auto_vec<slp_tree, 4> children;
1956 bool hard_fail = true;
1957 for (unsigned lane = 0; lane < group_size; ++lane)
1959 /* For each lane linearize the addition/subtraction (or other
1960 uniform associatable operation) expression tree. */
1961 gimple *op_stmt = NULL, *other_op_stmt = NULL;
1962 vect_slp_linearize_chain (vinfo, worklist, chain, code,
1963 stmts[lane]->stmt, op_stmt, other_op_stmt,
1964 NULL);
1965 if (!op_stmt_info && op_stmt)
1966 op_stmt_info = vinfo->lookup_stmt (op_stmt);
1967 if (!other_op_stmt_info && other_op_stmt)
1968 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1969 if (chain.length () == 2)
1971 /* In a chain of just two elements resort to the regular
1972 operand swapping scheme. If we run into a length
1973 mismatch still hard-FAIL. */
1974 if (chain_len == 0)
1975 hard_fail = false;
1976 else
1978 matches[lane] = false;
1979 /* ??? We might want to process the other lanes, but
1980 make sure to not give false matching hints to the
1981 caller for lanes we did not process. */
1982 if (lane != group_size - 1)
1983 matches[0] = false;
1985 break;
1987 else if (chain_len == 0)
1988 chain_len = chain.length ();
1989 else if (chain.length () != chain_len)
1991 /* ??? Here we could slip in magic to compensate with
1992 neutral operands. */
1993 matches[lane] = false;
1994 if (lane != group_size - 1)
1995 matches[0] = false;
1996 break;
1998 chains.quick_push (chain.copy ());
1999 chain.truncate (0);
2001 if (chains.length () == group_size)
2003 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2004 if (!op_stmt_info)
2006 hard_fail = false;
2007 goto out;
2009 /* Now we have a set of chains with the same length. */
2010 /* 1. pre-sort according to def_type and operation. */
2011 for (unsigned lane = 0; lane < group_size; ++lane)
2012 chains[lane].stablesort (dt_sort_cmp, vinfo);
2013 if (dump_enabled_p ())
2015 dump_printf_loc (MSG_NOTE, vect_location,
2016 "pre-sorted chains of %s\n",
2017 get_tree_code_name (code));
2018 for (unsigned lane = 0; lane < group_size; ++lane)
2020 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2021 dump_printf (MSG_NOTE, "%s %T ",
2022 get_tree_code_name (chains[lane][opnum].code),
2023 chains[lane][opnum].op);
2024 dump_printf (MSG_NOTE, "\n");
2027 /* 2. try to build children nodes, associating as necessary. */
2028 for (unsigned n = 0; n < chain_len; ++n)
2030 vect_def_type dt = chains[0][n].dt;
2031 unsigned lane;
2032 for (lane = 0; lane < group_size; ++lane)
2033 if (chains[lane][n].dt != dt)
2035 if (dt == vect_constant_def
2036 && chains[lane][n].dt == vect_external_def)
2037 dt = vect_external_def;
2038 else if (dt == vect_external_def
2039 && chains[lane][n].dt == vect_constant_def)
2041 else
2042 break;
2044 if (lane != group_size)
2046 if (dump_enabled_p ())
2047 dump_printf_loc (MSG_NOTE, vect_location,
2048 "giving up on chain due to mismatched "
2049 "def types\n");
2050 matches[lane] = false;
2051 if (lane != group_size - 1)
2052 matches[0] = false;
2053 goto out;
2055 if (dt == vect_constant_def
2056 || dt == vect_external_def)
2058 /* Check whether we can build the invariant. If we can't
2059 we never will be able to. */
2060 tree type = TREE_TYPE (chains[0][n].op);
2061 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2062 && (TREE_CODE (type) == BOOLEAN_TYPE
2063 || !can_duplicate_and_interleave_p (vinfo, group_size,
2064 type)))
2066 matches[0] = false;
2067 goto out;
2069 vec<tree> ops;
2070 ops.create (group_size);
2071 for (lane = 0; lane < group_size; ++lane)
2072 ops.quick_push (chains[lane][n].op);
2073 slp_tree child = vect_create_new_slp_node (ops);
2074 SLP_TREE_DEF_TYPE (child) = dt;
2075 children.safe_push (child);
2077 else if (dt != vect_internal_def)
2079 /* Not sure, we might need sth special.
2080 gcc.dg/vect/pr96854.c,
2081 gfortran.dg/vect/fast-math-pr37021.f90
2082 and gfortran.dg/vect/pr61171.f trigger. */
2083 /* Soft-fail for now. */
2084 hard_fail = false;
2085 goto out;
2087 else
2089 vec<stmt_vec_info> op_stmts;
2090 op_stmts.create (group_size);
2091 slp_tree child = NULL;
2092 /* Brute-force our way. We have to consider a lane
2093 failing after fixing an earlier fail up in the
2094 SLP discovery recursion. So track the current
2095 permute per lane. */
2096 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2097 memset (perms, 0, sizeof (unsigned) * group_size);
2100 op_stmts.truncate (0);
2101 for (lane = 0; lane < group_size; ++lane)
2102 op_stmts.quick_push
2103 (vinfo->lookup_def (chains[lane][n].op));
2104 child = vect_build_slp_tree (vinfo, op_stmts,
2105 group_size, &this_max_nunits,
2106 matches, limit,
2107 &this_tree_size, bst_map);
2108 /* ??? We're likely getting too many fatal mismatches
2109 here so maybe we want to ignore them (but then we
2110 have no idea which lanes fatally mismatched). */
2111 if (child || !matches[0])
2112 break;
2113 /* Swap another lane we have not yet matched up into
2114 lanes that did not match. If we run out of
2115 permute possibilities for a lane terminate the
2116 search. */
2117 bool term = false;
2118 for (lane = 1; lane < group_size; ++lane)
2119 if (!matches[lane])
2121 if (n + perms[lane] + 1 == chain_len)
2123 term = true;
2124 break;
2126 std::swap (chains[lane][n],
2127 chains[lane][n + perms[lane] + 1]);
2128 perms[lane]++;
2130 if (term)
2131 break;
2133 while (1);
2134 if (!child)
2136 if (dump_enabled_p ())
2137 dump_printf_loc (MSG_NOTE, vect_location,
2138 "failed to match up op %d\n", n);
2139 op_stmts.release ();
2140 if (lane != group_size - 1)
2141 matches[0] = false;
2142 else
2143 matches[lane] = false;
2144 goto out;
2146 if (dump_enabled_p ())
2148 dump_printf_loc (MSG_NOTE, vect_location,
2149 "matched up op %d to\n", n);
2150 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2152 children.safe_push (child);
2155 /* 3. build SLP nodes to combine the chain. */
2156 for (unsigned lane = 0; lane < group_size; ++lane)
2157 if (chains[lane][0].code != code)
2159 /* See if there's any alternate all-PLUS entry. */
2160 unsigned n;
2161 for (n = 1; n < chain_len; ++n)
2163 for (lane = 0; lane < group_size; ++lane)
2164 if (chains[lane][n].code != code)
2165 break;
2166 if (lane == group_size)
2167 break;
2169 if (n != chain_len)
2171 /* Swap that in at first position. */
2172 std::swap (children[0], children[n]);
2173 for (lane = 0; lane < group_size; ++lane)
2174 std::swap (chains[lane][0], chains[lane][n]);
2176 else
2178 /* ??? When this triggers and we end up with two
2179 vect_constant/external_def up-front things break (ICE)
2180 spectacularly finding an insertion place for the
2181 all-constant op. We should have a fully
2182 vect_internal_def operand though(?) so we can swap
2183 that into first place and then prepend the all-zero
2184 constant. */
2185 if (dump_enabled_p ())
2186 dump_printf_loc (MSG_NOTE, vect_location,
2187 "inserting constant zero to compensate "
2188 "for (partially) negated first "
2189 "operand\n");
2190 chain_len++;
2191 for (lane = 0; lane < group_size; ++lane)
2192 chains[lane].safe_insert
2193 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2194 vec<tree> zero_ops;
2195 zero_ops.create (group_size);
2196 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2197 for (lane = 1; lane < group_size; ++lane)
2198 zero_ops.quick_push (zero_ops[0]);
2199 slp_tree zero = vect_create_new_slp_node (zero_ops);
2200 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2201 children.safe_insert (0, zero);
2203 break;
2205 for (unsigned i = 1; i < children.length (); ++i)
2207 slp_tree op0 = children[i - 1];
2208 slp_tree op1 = children[i];
2209 bool this_two_op = false;
2210 for (unsigned lane = 0; lane < group_size; ++lane)
2211 if (chains[lane][i].code != chains[0][i].code)
2213 this_two_op = true;
2214 break;
2216 slp_tree child;
2217 if (i == children.length () - 1)
2218 child = vect_create_new_slp_node (node, stmts, 2);
2219 else
2220 child = vect_create_new_slp_node (2, ERROR_MARK);
2221 if (this_two_op)
2223 vec<std::pair<unsigned, unsigned> > lperm;
2224 lperm.create (group_size);
2225 for (unsigned lane = 0; lane < group_size; ++lane)
2226 lperm.quick_push (std::make_pair
2227 (chains[lane][i].code != chains[0][i].code, lane));
2228 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2229 (chains[0][i].code == code
2230 ? op_stmt_info
2231 : other_op_stmt_info),
2232 (chains[0][i].code == code
2233 ? other_op_stmt_info
2234 : op_stmt_info),
2235 lperm);
2237 else
2239 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2240 SLP_TREE_VECTYPE (child) = vectype;
2241 SLP_TREE_LANES (child) = group_size;
2242 SLP_TREE_CHILDREN (child).quick_push (op0);
2243 SLP_TREE_CHILDREN (child).quick_push (op1);
2244 SLP_TREE_REPRESENTATIVE (child)
2245 = (chains[0][i].code == code
2246 ? op_stmt_info : other_op_stmt_info);
2248 children[i] = child;
2250 *tree_size += this_tree_size + 1;
2251 *max_nunits = this_max_nunits;
2252 while (!chains.is_empty ())
2253 chains.pop ().release ();
2254 return node;
2256 out:
2257 while (!children.is_empty ())
2258 vect_free_slp_tree (children.pop ());
2259 while (!chains.is_empty ())
2260 chains.pop ().release ();
2261 /* Hard-fail, otherwise we might run into quadratic processing of the
2262 chains starting one stmt into the chain again. */
2263 if (hard_fail)
2264 return NULL;
2265 /* Fall thru to normal processing. */
2268 /* Get at the operands, verifying they are compatible. */
2269 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2270 slp_oprnd_info oprnd_info;
2271 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2273 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2274 stmts, i, &oprnds_info);
2275 if (res != 0)
2276 matches[(res == -1) ? 0 : i] = false;
2277 if (!matches[0])
2278 break;
2280 for (i = 0; i < group_size; ++i)
2281 if (!matches[i])
2283 vect_free_oprnd_info (oprnds_info);
2284 return NULL;
2286 swap = NULL;
2288 auto_vec<slp_tree, 4> children;
2290 stmt_info = stmts[0];
2292 /* Create SLP_TREE nodes for the definition node/s. */
2293 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2295 slp_tree child;
2296 unsigned int j;
2298 /* We're skipping certain operands from processing, for example
2299 outer loop reduction initial defs. */
2300 if (skip_args[i])
2302 children.safe_push (NULL);
2303 continue;
2306 if (oprnd_info->first_dt == vect_uninitialized_def)
2308 /* COND_EXPRs may end up with one operand too many when the condition
2309 is an SSA name. */
2310 gcc_assert (i == 3 && nops == 4);
2311 continue;
2314 if (is_a <bb_vec_info> (vinfo)
2315 && oprnd_info->first_dt == vect_internal_def
2316 && !oprnd_info->any_pattern)
2318 /* For BB vectorization, if all defs are the same do not
2319 bother to continue the build along the single-lane
2320 graph but use a splat of the scalar value. */
2321 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2322 for (j = 1; j < group_size; ++j)
2323 if (oprnd_info->def_stmts[j] != first_def)
2324 break;
2325 if (j == group_size
2326 /* But avoid doing this for loads where we may be
2327 able to CSE things, unless the stmt is not
2328 vectorizable. */
2329 && (!STMT_VINFO_VECTORIZABLE (first_def)
2330 || !gimple_vuse (first_def->stmt)))
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_NOTE, vect_location,
2334 "Using a splat of the uniform operand %G",
2335 first_def->stmt);
2336 oprnd_info->first_dt = vect_external_def;
2340 if (oprnd_info->first_dt == vect_external_def
2341 || oprnd_info->first_dt == vect_constant_def)
2343 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2344 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2345 oprnd_info->ops = vNULL;
2346 children.safe_push (invnode);
2347 continue;
2350 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2351 group_size, &this_max_nunits,
2352 matches, limit,
2353 &this_tree_size, bst_map)) != NULL)
2355 oprnd_info->def_stmts = vNULL;
2356 children.safe_push (child);
2357 continue;
2360 /* If the SLP build for operand zero failed and operand zero
2361 and one can be commutated try that for the scalar stmts
2362 that failed the match. */
2363 if (i == 0
2364 /* A first scalar stmt mismatch signals a fatal mismatch. */
2365 && matches[0]
2366 /* ??? For COND_EXPRs we can swap the comparison operands
2367 as well as the arms under some constraints. */
2368 && nops == 2
2369 && oprnds_info[1]->first_dt == vect_internal_def
2370 && is_gimple_assign (stmt_info->stmt)
2371 /* Swapping operands for reductions breaks assumptions later on. */
2372 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2373 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2375 /* See whether we can swap the matching or the non-matching
2376 stmt operands. */
2377 bool swap_not_matching = true;
2380 for (j = 0; j < group_size; ++j)
2382 if (matches[j] != !swap_not_matching)
2383 continue;
2384 stmt_vec_info stmt_info = stmts[j];
2385 /* Verify if we can swap operands of this stmt. */
2386 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2387 if (!stmt
2388 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2390 if (!swap_not_matching)
2391 goto fail;
2392 swap_not_matching = false;
2393 break;
2397 while (j != group_size);
2399 /* Swap mismatched definition stmts. */
2400 if (dump_enabled_p ())
2401 dump_printf_loc (MSG_NOTE, vect_location,
2402 "Re-trying with swapped operands of stmts ");
2403 for (j = 0; j < group_size; ++j)
2404 if (matches[j] == !swap_not_matching)
2406 std::swap (oprnds_info[0]->def_stmts[j],
2407 oprnds_info[1]->def_stmts[j]);
2408 std::swap (oprnds_info[0]->ops[j],
2409 oprnds_info[1]->ops[j]);
2410 if (dump_enabled_p ())
2411 dump_printf (MSG_NOTE, "%d ", j);
2413 if (dump_enabled_p ())
2414 dump_printf (MSG_NOTE, "\n");
2415 /* After swapping some operands we lost track of whether an
2416 operand has any pattern defs, so be conservative here. */
2417 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2418 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2419 /* And try again with scratch 'matches' ... */
2420 bool *tem = XALLOCAVEC (bool, group_size);
2421 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2422 group_size, &this_max_nunits,
2423 tem, limit,
2424 &this_tree_size, bst_map)) != NULL)
2426 oprnd_info->def_stmts = vNULL;
2427 children.safe_push (child);
2428 continue;
2431 fail:
2433 /* If the SLP build failed and we analyze a basic-block
2434 simply treat nodes we fail to build as externally defined
2435 (and thus build vectors from the scalar defs).
2436 The cost model will reject outright expensive cases.
2437 ??? This doesn't treat cases where permutation ultimately
2438 fails (or we don't try permutation below). Ideally we'd
2439 even compute a permutation that will end up with the maximum
2440 SLP tree size... */
2441 if (is_a <bb_vec_info> (vinfo)
2442 /* ??? Rejecting patterns this way doesn't work. We'd have to
2443 do extra work to cancel the pattern so the uses see the
2444 scalar version. */
2445 && !is_pattern_stmt_p (stmt_info)
2446 && !oprnd_info->any_pattern)
2448 /* But if there's a leading vector-sized set of matching stmts
2449 fail here so we can split the group. This matches the condition
2450 vect_analyze_slp_instance uses. */
2451 /* ??? We might want to split here and combine the results to support
2452 multiple vector sizes better. */
2453 for (j = 0; j < group_size; ++j)
2454 if (!matches[j])
2455 break;
2456 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2458 if (dump_enabled_p ())
2459 dump_printf_loc (MSG_NOTE, vect_location,
2460 "Building vector operands from scalars\n");
2461 this_tree_size++;
2462 child = vect_create_new_slp_node (oprnd_info->ops);
2463 children.safe_push (child);
2464 oprnd_info->ops = vNULL;
2465 continue;
2469 gcc_assert (child == NULL);
2470 FOR_EACH_VEC_ELT (children, j, child)
2471 if (child)
2472 vect_free_slp_tree (child);
2473 vect_free_oprnd_info (oprnds_info);
2474 return NULL;
2477 vect_free_oprnd_info (oprnds_info);
2479 /* If all children of a node are built up from uniform scalars, or
2480 building them needs more than one possibly expensive vector
2481 construction, throw the node away so it is built up from scalars
2482 instead. The exception is the SLP node for the vector store. */
2483 if (is_a <bb_vec_info> (vinfo)
2484 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2485 /* ??? Rejecting patterns this way doesn't work. We'd have to
2486 do extra work to cancel the pattern so the uses see the
2487 scalar version. */
2488 && !is_pattern_stmt_p (stmt_info))
2490 slp_tree child;
2491 unsigned j;
2492 bool all_uniform_p = true;
2493 unsigned n_vector_builds = 0;
2494 FOR_EACH_VEC_ELT (children, j, child)
2496 if (!child)
2498 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2499 all_uniform_p = false;
2500 else if (!vect_slp_tree_uniform_p (child))
2502 all_uniform_p = false;
2503 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2504 n_vector_builds++;
2507 if (all_uniform_p
2508 || n_vector_builds > 1
2509 || (n_vector_builds == children.length ()
2510 && is_a <gphi *> (stmt_info->stmt)))
2512 /* Roll back. */
2513 matches[0] = false;
2514 FOR_EACH_VEC_ELT (children, j, child)
2515 if (child)
2516 vect_free_slp_tree (child);
2518 if (dump_enabled_p ())
2519 dump_printf_loc (MSG_NOTE, vect_location,
2520 "Building parent vector operands from "
2521 "scalars instead\n");
2522 return NULL;
2526 *tree_size += this_tree_size + 1;
2527 *max_nunits = this_max_nunits;
2529 if (two_operators)
2531 /* ??? We'd likely want to either cache in bst_map sth like
2532 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2533 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2534 explicit stmts to put in so the keying on 'stmts' doesn't
2535 work (but we have the same issue with nodes that use 'ops'). */
2536 slp_tree one = new _slp_tree;
2537 slp_tree two = new _slp_tree;
2538 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2539 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2540 SLP_TREE_VECTYPE (one) = vectype;
2541 SLP_TREE_VECTYPE (two) = vectype;
2542 SLP_TREE_CHILDREN (one).safe_splice (children);
2543 SLP_TREE_CHILDREN (two).safe_splice (children);
2544 slp_tree child;
2545 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2546 SLP_TREE_REF_COUNT (child)++;
2548 /* Here we record the original defs since this
2549 node represents the final lane configuration. */
2550 node = vect_create_new_slp_node (node, stmts, 2);
2551 SLP_TREE_VECTYPE (node) = vectype;
2552 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2553 SLP_TREE_CHILDREN (node).quick_push (one);
2554 SLP_TREE_CHILDREN (node).quick_push (two);
2555 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2556 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2557 enum tree_code ocode = ERROR_MARK;
2558 stmt_vec_info ostmt_info;
2559 unsigned j = 0;
2560 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2562 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2563 if (gimple_assign_rhs_code (ostmt) != code0)
2565 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2566 ocode = gimple_assign_rhs_code (ostmt);
2567 j = i;
2569 else
2570 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2572 SLP_TREE_CODE (one) = code0;
2573 SLP_TREE_CODE (two) = ocode;
2574 SLP_TREE_LANES (one) = stmts.length ();
2575 SLP_TREE_LANES (two) = stmts.length ();
2576 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2577 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2578 return node;
2581 node = vect_create_new_slp_node (node, stmts, nops);
2582 SLP_TREE_VECTYPE (node) = vectype;
2583 SLP_TREE_CHILDREN (node).splice (children);
2584 return node;
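/* As an illustration (values hypothetical): for a two_operators group
   { x0 + y0, x1 - y1, x2 + y2, x3 - y3 } the code above builds one
   child computing all PLUS lanes and one computing all MINUS lanes and
   blends them with a VEC_PERM_EXPR node whose lane permutation is
   { 0[0], 1[1], 0[2], 1[3] }, i.e. lane i is taken from child 0 (PLUS)
   or child 1 (MINUS) according to the scalar operation in that lane.  */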
2587 /* Dump a single SLP tree NODE. */
2589 static void
2590 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2591 slp_tree node)
2593 unsigned i, j;
2594 slp_tree child;
2595 stmt_vec_info stmt_info;
2596 tree op;
2598 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2599 dump_user_location_t user_loc = loc.get_user_location ();
2600 dump_printf_loc (metadata, user_loc,
2601 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2602 ", refcnt=%u)",
2603 SLP_TREE_DEF_TYPE (node) == vect_external_def
2604 ? " (external)"
2605 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2606 ? " (constant)"
2607 : ""), (void *) node,
2608 estimated_poly_value (node->max_nunits),
2609 SLP_TREE_REF_COUNT (node));
2610 if (SLP_TREE_VECTYPE (node))
2611 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2612 dump_printf (metadata, "\n");
2613 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2615 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2616 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2617 else
2618 dump_printf_loc (metadata, user_loc, "op template: %G",
2619 SLP_TREE_REPRESENTATIVE (node)->stmt);
2621 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2622 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2623 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2624 else
2626 dump_printf_loc (metadata, user_loc, "\t{ ");
2627 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2628 dump_printf (metadata, "%T%s ", op,
2629 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2630 dump_printf (metadata, "}\n");
2632 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2634 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2635 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2636 dump_printf (dump_kind, " %u", j);
2637 dump_printf (dump_kind, " }\n");
2639 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2641 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2642 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2643 dump_printf (dump_kind, " %u[%u]",
2644 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2645 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2646 dump_printf (dump_kind, " }\n");
2648 if (SLP_TREE_CHILDREN (node).is_empty ())
2649 return;
2650 dump_printf_loc (metadata, user_loc, "\tchildren");
2651 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2652 dump_printf (dump_kind, " %p", (void *)child);
2653 dump_printf (dump_kind, "\n");
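/* The dump produced above looks roughly like the following (pointers,
   statements and counts are purely illustrative):

     node 0x47a28e0 (max_nunits=4, refcnt=2) vector(4) int
     op template: _5 = _3 + _4;
        stmt 0 _5 = _3 + _4;
        stmt 1 _8 = _6 + _7;
        children 0x47a2970 0x47a2a00  */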
2656 DEBUG_FUNCTION void
2657 debug (slp_tree node)
2659 debug_dump_context ctx;
2660 vect_print_slp_tree (MSG_NOTE,
2661 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2662 node);
2665 /* Recursive helper for the dot producer below. */
2667 static void
2668 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2670 if (visited.add (node))
2671 return;
2673 fprintf (f, "\"%p\" [label=\"", (void *)node);
2674 vect_print_slp_tree (MSG_NOTE,
2675 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2676 node);
2677 fprintf (f, "\"];\n");
2680 for (slp_tree child : SLP_TREE_CHILDREN (node))
2681 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2683 for (slp_tree child : SLP_TREE_CHILDREN (node))
2684 if (child)
2685 dot_slp_tree (f, child, visited);
2688 DEBUG_FUNCTION void
2689 dot_slp_tree (const char *fname, slp_tree node)
2691 FILE *f = fopen (fname, "w");
2692 fprintf (f, "digraph {\n");
2693 fflush (f);
2695 debug_dump_context ctx (f);
2696 hash_set<slp_tree> visited;
2697 dot_slp_tree (f, node, visited);
2699 fflush (f);
2700 fprintf (f, "}\n");
2701 fclose (f);
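/* The file-writing overload above is most useful from the debugger,
   e.g. (file name illustrative)

     (gdb) call dot_slp_tree ("/tmp/slp.dot", slp_node)

   which produces a digraph whose vertices are labelled with the
   vect_print_slp_tree dump of each node and whose edges follow
   SLP_TREE_CHILDREN, viewable with dot -Tpdf or similar.  */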
2704 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2706 static void
2707 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2708 slp_tree node, hash_set<slp_tree> &visited)
2710 unsigned i;
2711 slp_tree child;
2713 if (visited.add (node))
2714 return;
2716 vect_print_slp_tree (dump_kind, loc, node);
2718 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2719 if (child)
2720 vect_print_slp_graph (dump_kind, loc, child, visited);
2723 static void
2724 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2725 slp_tree entry)
2727 hash_set<slp_tree> visited;
2728 vect_print_slp_graph (dump_kind, loc, entry, visited);
2731 /* Mark the tree rooted at NODE with PURE_SLP. */
2733 static void
2734 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2736 int i;
2737 stmt_vec_info stmt_info;
2738 slp_tree child;
2740 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2741 return;
2743 if (visited.add (node))
2744 return;
2746 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2747 STMT_SLP_TYPE (stmt_info) = pure_slp;
2749 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2750 if (child)
2751 vect_mark_slp_stmts (child, visited);
2754 static void
2755 vect_mark_slp_stmts (slp_tree node)
2757 hash_set<slp_tree> visited;
2758 vect_mark_slp_stmts (node, visited);
2761 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2763 static void
2764 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2766 int i;
2767 stmt_vec_info stmt_info;
2768 slp_tree child;
2770 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2771 return;
2773 if (visited.add (node))
2774 return;
2776 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2778 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2779 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2780 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2783 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2784 if (child)
2785 vect_mark_slp_stmts_relevant (child, visited);
2788 static void
2789 vect_mark_slp_stmts_relevant (slp_tree node)
2791 hash_set<slp_tree> visited;
2792 vect_mark_slp_stmts_relevant (node, visited);
2796 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
2798 static void
2799 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2800 hash_set<slp_tree> &visited)
2802 if (!node || visited.add (node))
2803 return;
2805 if (SLP_TREE_CHILDREN (node).length () == 0)
2807 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2808 return;
2809 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2810 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2811 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2812 loads.safe_push (node);
2814 else
2816 unsigned i;
2817 slp_tree child;
2818 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2819 vect_gather_slp_loads (loads, child, visited);
2824 /* Find the last scalar stmt in NODE. */
2826 stmt_vec_info
2827 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2829 stmt_vec_info last = NULL;
2830 stmt_vec_info stmt_vinfo;
2832 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2834 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2835 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2838 return last;
2841 /* Find the first stmt in NODE. */
2843 stmt_vec_info
2844 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2846 stmt_vec_info first = NULL;
2847 stmt_vec_info stmt_vinfo;
2849 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2851 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2852 if (!first
2853 || get_later_stmt (stmt_vinfo, first) == first)
2854 first = stmt_vinfo;
2857 return first;
2860 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2861 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2862 (also containing the first GROUP1_SIZE stmts, since stores are
2863 consecutive), the second containing the remainder.
2864 Return the first stmt in the second group. */
2866 static stmt_vec_info
2867 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2869 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2870 gcc_assert (group1_size > 0);
2871 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2872 gcc_assert (group2_size > 0);
2873 DR_GROUP_SIZE (first_vinfo) = group1_size;
2875 stmt_vec_info stmt_info = first_vinfo;
2876 for (unsigned i = group1_size; i > 1; i--)
2878 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2879 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2881 /* STMT is now the last element of the first group. */
2882 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2883 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2885 DR_GROUP_SIZE (group2) = group2_size;
2886 for (stmt_info = group2; stmt_info;
2887 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2889 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2890 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2893 /* For the second group, the DR_GROUP_GAP is that before the original group,
2894 plus skipping over the first group. */
2895 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2897 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2898 DR_GROUP_GAP (first_vinfo) += group2_size;
2900 if (dump_enabled_p ())
2901 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2902 group1_size, group2_size);
2904 return group2;
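/* For example (numbers illustrative): splitting a store group of
   DR_GROUP_SIZE 8 and DR_GROUP_GAP 0 with GROUP1_SIZE 4 leaves the
   first group with DR_GROUP_SIZE 4 and DR_GROUP_GAP 0 + 4 (it now has
   to step over the second group's elements) and creates a second group
   with DR_GROUP_SIZE 4 and DR_GROUP_GAP 0 + 4 (stepping over the first
   group).  */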
2907 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2908 statements and a vector of NUNITS elements. */
2910 static poly_uint64
2911 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2913 return exact_div (common_multiple (nunits, group_size), group_size);
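/* E.g. with NUNITS 4 and GROUP_SIZE 6 this computes
   common_multiple (4, 6) / 6 = 12 / 6 = 2, i.e. two copies of the group
   are needed to fill whole vectors, while with GROUP_SIZE 8 the result
   is 8 / 8 = 1 and no unrolling is required.  */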
2916 /* Helper that checks to see if a node is a load node. */
2918 static inline bool
2919 vect_is_slp_load_node (slp_tree root)
2921 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2922 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2923 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2927 /* Helper function of optimize_load_redistribution that performs the operation
2928 recursively. */
2930 static slp_tree
2931 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2932 vec_info *vinfo, unsigned int group_size,
2933 hash_map<slp_tree, slp_tree> *load_map,
2934 slp_tree root)
2936 if (slp_tree *leader = load_map->get (root))
2937 return *leader;
2939 slp_tree node;
2940 unsigned i;
2942 /* For now, we don't know anything about externals so do not do anything. */
2943 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2944 return NULL;
2945 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2947 /* First convert this node into a load node and add it to the leaves
2948 list, flattening the lane permute into a load permutation. If it's
2949 unneeded it will be elided later. */
2950 vec<stmt_vec_info> stmts;
2951 stmts.create (SLP_TREE_LANES (root));
2952 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2953 for (unsigned j = 0; j < lane_perm.length (); j++)
2955 std::pair<unsigned, unsigned> perm = lane_perm[j];
2956 node = SLP_TREE_CHILDREN (root)[perm.first];
2958 if (!vect_is_slp_load_node (node)
2959 || SLP_TREE_CHILDREN (node).exists ())
2961 stmts.release ();
2962 goto next;
2965 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2968 if (dump_enabled_p ())
2969 dump_printf_loc (MSG_NOTE, vect_location,
2970 "converting stmts on permute node %p\n",
2971 (void *) root);
2973 bool *matches = XALLOCAVEC (bool, group_size);
2974 poly_uint64 max_nunits = 1;
2975 unsigned tree_size = 0, limit = 1;
2976 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2977 matches, &limit, &tree_size, bst_map);
2978 if (!node)
2979 stmts.release ();
2981 load_map->put (root, node);
2982 return node;
2985 next:
2986 load_map->put (root, NULL);
2988 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2990 slp_tree value
2991 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2992 node);
2993 if (value)
2995 SLP_TREE_REF_COUNT (value)++;
2996 SLP_TREE_CHILDREN (root)[i] = value;
2997 /* ??? We know the original leafs of the replaced nodes will
2998 be referenced by bst_map, only the permutes created by
2999 pattern matching are not. */
3000 if (SLP_TREE_REF_COUNT (node) == 1)
3001 load_map->remove (node);
3002 vect_free_slp_tree (node);
3006 return NULL;
3009 /* Temporary workaround for loads not being CSEd during SLP build. This
3010 function will traverse the SLP tree rooted in ROOT and find
3011 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3012 same DR such that the final operation is equal to a permuted load. Such
3013 NODES are then directly converted into LOADS themselves. The nodes are
3014 CSEd using BST_MAP. */
3016 static void
3017 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3018 vec_info *vinfo, unsigned int group_size,
3019 hash_map<slp_tree, slp_tree> *load_map,
3020 slp_tree root)
3022 slp_tree node;
3023 unsigned i;
3025 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3027 slp_tree value
3028 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3029 node);
3030 if (value)
3032 SLP_TREE_REF_COUNT (value)++;
3033 SLP_TREE_CHILDREN (root)[i] = value;
3034 /* ??? We know the original leafs of the replaced nodes will
3035 be referenced by bst_map, only the permutes created by
3036 pattern matching are not. */
3037 if (SLP_TREE_REF_COUNT (node) == 1)
3038 load_map->remove (node);
3039 vect_free_slp_tree (node);
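/* Sketch of the transform (node names illustrative): a pattern-created

     VEC_PERM { load_a[1], load_b[0] }

   where load_a and load_b are leaf load nodes reading from the same
   interleaved access is re-discovered, via vect_build_slp_tree on the
   referenced scalar stmts, as a single load node whose load permutation
   encodes the two lanes' positions in the interleaving chain.  */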
3044 /* Helper function of vect_match_slp_patterns.
3046 Attempts to match patterns against the slp tree rooted in REF_NODE using
3047 VINFO. Patterns are matched in post-order traversal.
3049 If any pattern matches, the matched nodes are updated in place and true is
3050 returned; otherwise the tree is left unchanged and false is returned. */
3052 static bool
3053 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3054 slp_tree_to_load_perm_map_t *perm_cache,
3055 slp_compat_nodes_map_t *compat_cache,
3056 hash_set<slp_tree> *visited)
3058 unsigned i;
3059 slp_tree node = *ref_node;
3060 bool found_p = false;
3061 if (!node || visited->add (node))
3062 return false;
3064 slp_tree child;
3065 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3066 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3067 vinfo, perm_cache, compat_cache,
3068 visited);
3070 for (unsigned x = 0; x < num__slp_patterns; x++)
3072 vect_pattern *pattern
3073 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3074 if (pattern)
3076 pattern->build (vinfo);
3077 delete pattern;
3078 found_p = true;
3082 return found_p;
3085 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3086 vec_info VINFO.
3088 Returns true if any pattern matched, modifying the tree in place. Patterns
3089 are tried in order and multiple patterns may match. */
3091 static bool
3092 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3093 hash_set<slp_tree> *visited,
3094 slp_tree_to_load_perm_map_t *perm_cache,
3095 slp_compat_nodes_map_t *compat_cache)
3097 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3098 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3100 if (dump_enabled_p ())
3101 dump_printf_loc (MSG_NOTE, vect_location,
3102 "Analyzing SLP tree %p for patterns\n",
3103 (void *) SLP_INSTANCE_TREE (instance));
3105 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3106 visited);
3109 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3110 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3111 Return true if we could use IFN_STORE_LANES instead and if that appears
3112 to be the better approach. */
3114 static bool
3115 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3116 unsigned int group_size,
3117 unsigned int new_group_size)
3119 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3120 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3121 if (!vectype)
3122 return false;
3123 /* Allow the split if one of the two new groups would operate on full
3124 vectors *within* rather than across one scalar loop iteration.
3125 This is purely a heuristic, but it should work well for group
3126 sizes of 3 and 4, where the possible splits are:
3128 3->2+1: OK if the vector has exactly two elements
3129 4->2+2: Likewise
3130 4->3+1: Less clear-cut. */
3131 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3132 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3133 return false;
3134 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
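/* Concretely: splitting a group of 4 into 2 + 2 with a two-element
   vector type never prefers store-lanes, since each half then fills a
   whole vector and one of the multiple_p tests above fires.  Splitting
   a group of 3 into 2 + 1 with a four-element vector type leaves
   neither part filling a vector, so store-lanes is preferred whenever
   vect_store_lanes_supported reports an IFN for that vector type and
   group size.  */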
3137 /* Analyze an SLP instance starting from a group of grouped stores. Call
3138 vect_build_slp_tree to build a tree of packed stmts if possible.
3139 Return FALSE if it's impossible to SLP any stmt in the loop. */
3141 static bool
3142 vect_analyze_slp_instance (vec_info *vinfo,
3143 scalar_stmts_to_slp_tree_map_t *bst_map,
3144 stmt_vec_info stmt_info, slp_instance_kind kind,
3145 unsigned max_tree_size, unsigned *limit);
3147 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3148 of KIND. Return true if successful. */
3150 static bool
3151 vect_build_slp_instance (vec_info *vinfo,
3152 slp_instance_kind kind,
3153 vec<stmt_vec_info> &scalar_stmts,
3154 vec<stmt_vec_info> &root_stmt_infos,
3155 vec<tree> &remain,
3156 unsigned max_tree_size, unsigned *limit,
3157 scalar_stmts_to_slp_tree_map_t *bst_map,
3158 /* ??? We need stmt_info for group splitting. */
3159 stmt_vec_info stmt_info_)
3161 if (kind == slp_inst_kind_ctor)
3163 if (dump_enabled_p ())
3164 dump_printf_loc (MSG_NOTE, vect_location,
3165 "Analyzing vectorizable constructor: %G\n",
3166 root_stmt_infos[0]->stmt);
3169 if (dump_enabled_p ())
3171 dump_printf_loc (MSG_NOTE, vect_location,
3172 "Starting SLP discovery for\n");
3173 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3174 dump_printf_loc (MSG_NOTE, vect_location,
3175 " %G", scalar_stmts[i]->stmt);
3178 /* When a BB reduction doesn't have an even number of lanes
3179 strip it down, treating the remaining lane as scalar.
3180 ??? Selecting the optimal set of lanes to vectorize would be nice
3181 but SLP build for all lanes will fail quickly because we think
3182 we're going to need unrolling. */
3183 if (kind == slp_inst_kind_bb_reduc
3184 && (scalar_stmts.length () & 1))
3185 remain.safe_insert (0, gimple_get_lhs (scalar_stmts.pop ()->stmt));
3187 /* Build the tree for the SLP instance. */
3188 unsigned int group_size = scalar_stmts.length ();
3189 bool *matches = XALLOCAVEC (bool, group_size);
3190 poly_uint64 max_nunits = 1;
3191 unsigned tree_size = 0;
3192 unsigned i;
3193 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3194 &max_nunits, matches, limit,
3195 &tree_size, bst_map);
3196 if (node != NULL)
3198 /* Calculate the unrolling factor based on the smallest type. */
3199 poly_uint64 unrolling_factor
3200 = calculate_unrolling_factor (max_nunits, group_size);
3202 if (maybe_ne (unrolling_factor, 1U)
3203 && is_a <bb_vec_info> (vinfo))
3205 unsigned HOST_WIDE_INT const_max_nunits;
3206 if (!max_nunits.is_constant (&const_max_nunits)
3207 || const_max_nunits > group_size)
3209 if (dump_enabled_p ())
3210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3211 "Build SLP failed: store group "
3212 "size not a multiple of the vector size "
3213 "in basic block SLP\n");
3214 vect_free_slp_tree (node);
3215 return false;
3217 /* Fatal mismatch. */
3218 if (dump_enabled_p ())
3219 dump_printf_loc (MSG_NOTE, vect_location,
3220 "SLP discovery succeeded but node needs "
3221 "splitting\n");
3222 memset (matches, true, group_size);
3223 matches[group_size / const_max_nunits * const_max_nunits] = false;
3224 vect_free_slp_tree (node);
3226 else
3228 /* Create a new SLP instance. */
3229 slp_instance new_instance = XNEW (class _slp_instance);
3230 SLP_INSTANCE_TREE (new_instance) = node;
3231 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3232 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3233 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3234 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3235 SLP_INSTANCE_KIND (new_instance) = kind;
3236 new_instance->reduc_phis = NULL;
3237 new_instance->cost_vec = vNULL;
3238 new_instance->subgraph_entries = vNULL;
3240 if (dump_enabled_p ())
3241 dump_printf_loc (MSG_NOTE, vect_location,
3242 "SLP size %u vs. limit %u.\n",
3243 tree_size, max_tree_size);
3245 /* Fixup SLP reduction chains. */
3246 if (kind == slp_inst_kind_reduc_chain)
3248 /* If this is a reduction chain with a conversion in front
3249 amend the SLP tree with a node for that. */
3250 gimple *scalar_def
3251 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3252 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3254 /* Get at the conversion stmt - we know it's the single use
3255 of the last stmt of the reduction chain. */
3256 use_operand_p use_p;
3257 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3258 &use_p, &scalar_def);
3259 gcc_assert (r);
3260 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3261 next_info = vect_stmt_to_vectorize (next_info);
3262 scalar_stmts = vNULL;
3263 scalar_stmts.create (group_size);
3264 for (unsigned i = 0; i < group_size; ++i)
3265 scalar_stmts.quick_push (next_info);
3266 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3267 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3268 SLP_TREE_CHILDREN (conv).quick_push (node);
3269 SLP_INSTANCE_TREE (new_instance) = conv;
3270 /* We also have to fake this conversion stmt as SLP reduction
3271 group so we don't have to mess with too much code
3272 elsewhere. */
3273 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3274 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3276 /* Fill the backedge child of the PHI SLP node. The
3277 general matching code cannot find it because the
3278 scalar code does not reflect how we vectorize the
3279 reduction. */
3280 use_operand_p use_p;
3281 imm_use_iterator imm_iter;
3282 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3283 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3284 gimple_get_lhs (scalar_def))
3285 /* There are exactly two non-debug uses, the reduction
3286 PHI and the loop-closed PHI node. */
3287 if (!is_gimple_debug (USE_STMT (use_p))
3288 && gimple_bb (USE_STMT (use_p)) == loop->header)
3290 auto_vec<stmt_vec_info, 64> phis (group_size);
3291 stmt_vec_info phi_info
3292 = vinfo->lookup_stmt (USE_STMT (use_p));
3293 for (unsigned i = 0; i < group_size; ++i)
3294 phis.quick_push (phi_info);
3295 slp_tree *phi_node = bst_map->get (phis);
3296 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3297 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3298 = SLP_INSTANCE_TREE (new_instance);
3299 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3303 vinfo->slp_instances.safe_push (new_instance);
3305 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3306 the number of scalar stmts in the root in a few places.
3307 Verify that assumption holds. */
3308 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3309 .length () == group_size);
3311 if (dump_enabled_p ())
3313 dump_printf_loc (MSG_NOTE, vect_location,
3314 "Final SLP tree for instance %p:\n",
3315 (void *) new_instance);
3316 vect_print_slp_graph (MSG_NOTE, vect_location,
3317 SLP_INSTANCE_TREE (new_instance));
3320 return true;
3323 else
3325 /* Failed to SLP. */
3326 /* Free the allocated memory. */
3327 scalar_stmts.release ();
3330 stmt_vec_info stmt_info = stmt_info_;
3331 /* Try to break the group up into pieces. */
3332 if (kind == slp_inst_kind_store)
3334 /* ??? We could delay all the actual splitting of store-groups
3335 until after SLP discovery of the original group completed.
3336 Then we can recurse to vect_build_slp_instance directly. */
3337 for (i = 0; i < group_size; i++)
3338 if (!matches[i])
3339 break;
3341 /* For basic block SLP, try to break the group up into multiples of
3342 a vector size. */
3343 if (is_a <bb_vec_info> (vinfo)
3344 && (i > 1 && i < group_size))
3346 tree scalar_type
3347 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3348 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3349 1 << floor_log2 (i));
3350 unsigned HOST_WIDE_INT const_nunits;
3351 if (vectype
3352 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3354 /* Split into two groups at the first vector boundary. */
3355 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3356 unsigned group1_size = i & ~(const_nunits - 1);
3358 if (dump_enabled_p ())
3359 dump_printf_loc (MSG_NOTE, vect_location,
3360 "Splitting SLP group at stmt %u\n", i);
3361 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3362 group1_size);
3363 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3364 kind, max_tree_size,
3365 limit);
3366 /* Split the rest at the failure point and possibly
3367 re-analyze the remaining matching part if it has
3368 at least two lanes. */
3369 if (group1_size < i
3370 && (i + 1 < group_size
3371 || i - group1_size > 1))
3373 stmt_vec_info rest2 = rest;
3374 rest = vect_split_slp_store_group (rest, i - group1_size);
3375 if (i - group1_size > 1)
3376 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3377 kind, max_tree_size,
3378 limit);
3380 /* Re-analyze the non-matching tail if it has at least
3381 two lanes. */
3382 if (i + 1 < group_size)
3383 res |= vect_analyze_slp_instance (vinfo, bst_map,
3384 rest, kind, max_tree_size,
3385 limit);
3386 return res;
3390 /* For loop vectorization split into arbitrary pieces of size > 1. */
3391 if (is_a <loop_vec_info> (vinfo)
3392 && (i > 1 && i < group_size)
3393 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3395 unsigned group1_size = i;
3397 if (dump_enabled_p ())
3398 dump_printf_loc (MSG_NOTE, vect_location,
3399 "Splitting SLP group at stmt %u\n", i);
3401 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3402 group1_size);
3403 /* Loop vectorization cannot handle gaps in stores, make sure
3404 the split group appears as strided. */
3405 STMT_VINFO_STRIDED_P (rest) = 1;
3406 DR_GROUP_GAP (rest) = 0;
3407 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3408 DR_GROUP_GAP (stmt_info) = 0;
3410 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3411 kind, max_tree_size, limit);
3412 if (i + 1 < group_size)
3413 res |= vect_analyze_slp_instance (vinfo, bst_map,
3414 rest, kind, max_tree_size, limit);
3416 return res;
3419 /* Even though the first vector did not all match, we might be able to SLP
3420 (some) of the remainder. FORNOW ignore this possibility. */
3423 /* Failed to SLP. */
3424 if (dump_enabled_p ())
3425 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3426 return false;
3430 /* Analyze an SLP instance starting from a group of grouped stores. Call
3431 vect_build_slp_tree to build a tree of packed stmts if possible.
3432 Return FALSE if it's impossible to SLP any stmt in the loop. */
3434 static bool
3435 vect_analyze_slp_instance (vec_info *vinfo,
3436 scalar_stmts_to_slp_tree_map_t *bst_map,
3437 stmt_vec_info stmt_info,
3438 slp_instance_kind kind,
3439 unsigned max_tree_size, unsigned *limit)
3441 unsigned int i;
3442 vec<stmt_vec_info> scalar_stmts;
3444 if (is_a <bb_vec_info> (vinfo))
3445 vect_location = stmt_info->stmt;
3447 stmt_vec_info next_info = stmt_info;
3448 if (kind == slp_inst_kind_store)
3450 /* Collect the stores and store them in scalar_stmts. */
3451 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3452 while (next_info)
3454 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3455 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3458 else if (kind == slp_inst_kind_reduc_chain)
3460 /* Collect the reduction stmts and store them in scalar_stmts. */
3461 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3462 while (next_info)
3464 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3465 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3467 /* Mark the first element of the reduction chain as reduction to properly
3468 transform the node. In the reduction analysis phase only the last
3469 element of the chain is marked as reduction. */
3470 STMT_VINFO_DEF_TYPE (stmt_info)
3471 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3472 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3473 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3475 else if (kind == slp_inst_kind_reduc_group)
3477 /* Collect reduction statements. */
3478 const vec<stmt_vec_info> &reductions
3479 = as_a <loop_vec_info> (vinfo)->reductions;
3480 scalar_stmts.create (reductions.length ());
3481 for (i = 0; reductions.iterate (i, &next_info); i++)
3482 if ((STMT_VINFO_RELEVANT_P (next_info)
3483 || STMT_VINFO_LIVE_P (next_info))
3484 /* ??? Make sure we didn't skip a conversion around a reduction
3485 path. In that case we'd have to reverse engineer that conversion
3486 stmt following the chain using reduc_idx and from the PHI
3487 using reduc_def. */
3488 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3489 scalar_stmts.quick_push (next_info);
3490 /* If less than two were relevant/live there's nothing to SLP. */
3491 if (scalar_stmts.length () < 2)
3492 return false;
3494 else
3495 gcc_unreachable ();
3497 vec<stmt_vec_info> roots = vNULL;
3498 vec<tree> remain = vNULL;
3499 /* Build the tree for the SLP instance. */
3500 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3501 roots, remain,
3502 max_tree_size, limit, bst_map,
3503 kind == slp_inst_kind_store
3504 ? stmt_info : NULL);
3506 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3507 where we should do store group splitting. */
3509 return res;
3512 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
3513 trees of packed scalar stmts if SLP is possible. */
3515 opt_result
3516 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3518 unsigned int i;
3519 stmt_vec_info first_element;
3520 slp_instance instance;
3522 DUMP_VECT_SCOPE ("vect_analyze_slp");
3524 unsigned limit = max_tree_size;
3526 scalar_stmts_to_slp_tree_map_t *bst_map
3527 = new scalar_stmts_to_slp_tree_map_t ();
3529 /* Find SLP sequences starting from groups of grouped stores. */
3530 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3531 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3532 slp_inst_kind_store, max_tree_size, &limit);
3534 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3536 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3538 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3539 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3540 bb_vinfo->roots[i].stmts,
3541 bb_vinfo->roots[i].roots,
3542 bb_vinfo->roots[i].remain,
3543 max_tree_size, &limit, bst_map, NULL))
3545 bb_vinfo->roots[i].stmts = vNULL;
3546 bb_vinfo->roots[i].roots = vNULL;
3547 bb_vinfo->roots[i].remain = vNULL;
3552 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3554 /* Find SLP sequences starting from reduction chains. */
3555 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3556 if (! STMT_VINFO_RELEVANT_P (first_element)
3557 && ! STMT_VINFO_LIVE_P (first_element))
3559 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3560 slp_inst_kind_reduc_chain,
3561 max_tree_size, &limit))
3563 /* Dissolve reduction chain group. */
3564 stmt_vec_info vinfo = first_element;
3565 stmt_vec_info last = NULL;
3566 while (vinfo)
3568 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3569 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3570 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3571 last = vinfo;
3572 vinfo = next;
3574 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3575 /* It can still be vectorized as part of an SLP reduction. */
3576 loop_vinfo->reductions.safe_push (last);
3579 /* Find SLP sequences starting from groups of reductions. */
3580 if (loop_vinfo->reductions.length () > 1)
3581 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3582 slp_inst_kind_reduc_group, max_tree_size,
3583 &limit);
3586 hash_set<slp_tree> visited_patterns;
3587 slp_tree_to_load_perm_map_t perm_cache;
3588 slp_compat_nodes_map_t compat_cache;
3590 /* See if any patterns can be found in the SLP tree. */
3591 bool pattern_found = false;
3592 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3593 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3594 &visited_patterns, &perm_cache,
3595 &compat_cache);
3597 /* If any were found optimize permutations of loads. */
3598 if (pattern_found)
3600 hash_map<slp_tree, slp_tree> load_map;
3601 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3603 slp_tree root = SLP_INSTANCE_TREE (instance);
3604 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3605 &load_map, root);
3611 /* The map keeps a reference on SLP nodes built, release that. */
3612 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3613 it != bst_map->end (); ++it)
3614 if ((*it).second)
3615 vect_free_slp_tree ((*it).second);
3616 delete bst_map;
3618 if (pattern_found && dump_enabled_p ())
3620 dump_printf_loc (MSG_NOTE, vect_location,
3621 "Pattern matched SLP tree\n");
3622 hash_set<slp_tree> visited;
3623 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3624 vect_print_slp_graph (MSG_NOTE, vect_location,
3625 SLP_INSTANCE_TREE (instance), visited);
3628 return opt_result::success ();
3631 /* Estimates the cost of inserting layout changes into the SLP graph.
3632 It can also say that the insertion is impossible. */
3634 struct slpg_layout_cost
3636 slpg_layout_cost () = default;
3637 slpg_layout_cost (sreal, bool);
3639 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3640 bool is_possible () const { return depth != sreal::max (); }
3642 bool operator== (const slpg_layout_cost &) const;
3643 bool operator!= (const slpg_layout_cost &) const;
3645 bool is_better_than (const slpg_layout_cost &, bool) const;
3647 void add_parallel_cost (const slpg_layout_cost &);
3648 void add_serial_cost (const slpg_layout_cost &);
3649 void split (unsigned int);
3651 /* The longest sequence of layout changes needed during any traversal
3652 of the partition dag, weighted by execution frequency.
3654 This is the most important metric when optimizing for speed, since
3655 it helps to ensure that we keep the number of operations on
3656 critical paths to a minimum. */
3657 sreal depth = 0;
3659 /* An estimate of the total number of operations needed. It is weighted by
3660 execution frequency when optimizing for speed but not when optimizing for
3661 size. In order to avoid double-counting, a node with a fanout of N will
3662 distribute 1/N of its total cost to each successor.
3664 This is the most important metric when optimizing for size, since
3665 it helps to keep the total number of operations to a minimum. */
3666 sreal total = 0;
3669 /* Construct costs for a node with weight WEIGHT. A higher weight
3670 indicates more frequent execution. IS_FOR_SIZE is true if we are
3671 optimizing for size rather than speed. */
3673 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3674 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
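/* For instance, slpg_layout_cost (10, false) gives { depth 10, total 10 }
   (optimizing for speed), while slpg_layout_cost (10, true) gives
   { depth 10, total 1 } (optimizing for size), since for size only the
   number of layout changes is counted, not how often they execute.  */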
3678 bool
3679 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3681 return depth == other.depth && total == other.total;
3684 bool
3685 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3687 return !operator== (other);
3690 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3691 true if we are optimizing for size rather than speed. */
3693 bool
3694 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3695 bool is_for_size) const
3697 if (is_for_size)
3699 if (total != other.total)
3700 return total < other.total;
3701 return depth < other.depth;
3703 else
3705 if (depth != other.depth)
3706 return depth < other.depth;
3707 return total < other.total;
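/* For instance, when optimizing for speed a cost of { depth 2, total 10 }
   is better than { depth 3, total 1 } because depth is compared first,
   whereas when optimizing for size the comparison goes the other way.  */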
3711 /* Increase the costs to account for something with cost INPUT_COST
3712 happening in parallel with the current costs. */
3714 void
3715 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3717 depth = std::max (depth, input_cost.depth);
3718 total += input_cost.total;
3721 /* Increase the costs to account for something with cost INPUT_COST
3722 happening in series with the current costs. */
3724 void
3725 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3727 depth += other.depth;
3728 total += other.total;
3731 /* Split the total cost among TIMES successors or predecessors. */
3733 void
3734 slpg_layout_cost::split (unsigned int times)
3736 if (times > 1)
3737 total /= times;
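/* As an illustration of how the primitives above combine when optimizing
   for speed (so that TOTAL is frequency-weighted like DEPTH):

     slpg_layout_cost a (2, false), b (2, false);
     a.add_parallel_cost (b);            --> a == { depth 2, total 4 }
     a.add_serial_cost ({ 1, false });   --> a == { depth 3, total 5 }
     a.split (2);                        --> a == { depth 3, total 2.5 }

   Parallel inputs share the depth but accumulate the total, serial steps
   accumulate both, and split spreads the total (but not the depth) over
   multiple consumers.  */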
3740 /* Information about one node in the SLP graph, for use during
3741 vect_optimize_slp_pass. */
3743 struct slpg_vertex
3745 slpg_vertex (slp_tree node_) : node (node_) {}
3747 /* The node itself. */
3748 slp_tree node;
3750 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3751 partitions are flexible; they can have whichever layout consumers
3752 want them to have. */
3753 int partition = -1;
3755 /* The number of nodes that directly use the result of this one
3756 (i.e. the number of nodes that count this one as a child). */
3757 unsigned int out_degree = 0;
3759 /* The execution frequency of the node. */
3760 sreal weight = 0;
3762 /* The total execution frequency of all nodes that directly use the
3763 result of this one. */
3764 sreal out_weight = 0;
3767 /* Information about one partition of the SLP graph, for use during
3768 vect_optimize_slp_pass. */
3770 struct slpg_partition_info
3772 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3773 of m_partitioned_nodes. */
3774 unsigned int node_begin = 0;
3775 unsigned int node_end = 0;
3777 /* Which layout we've chosen to use for this partition, or -1 if
3778 we haven't picked one yet. */
3779 int layout = -1;
3781 /* The number of predecessors and successors in the partition dag.
3782 The predecessors always have lower partition numbers and the
3783 successors always have higher partition numbers.
3785 Note that the directions of these edges are not necessarily the
3786 same as in the data flow graph. For example, if an SCC has separate
3787 partitions for an inner loop and an outer loop, the inner loop's
3788 partition will have at least two incoming edges from the outer loop's
3789 partition: one for a live-in value and one for a live-out value.
3790 In data flow terms, one of these edges would also be from the outer loop
3791 to the inner loop, but the other would be in the opposite direction. */
3792 unsigned int in_degree = 0;
3793 unsigned int out_degree = 0;
3796 /* Information about the costs of using a particular layout for a
3797 particular partition. It can also say that the combination is
3798 impossible. */
3800 struct slpg_partition_layout_costs
3802 bool is_possible () const { return internal_cost.is_possible (); }
3803 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3805 /* The costs inherited from predecessor partitions. */
3806 slpg_layout_cost in_cost;
3808 /* The inherent cost of the layout within the node itself. For example,
3809 this is nonzero for a load if choosing a particular layout would require
3810 the load to permute the loaded elements. It is nonzero for a
3811 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3812 to full-vector moves. */
3813 slpg_layout_cost internal_cost;
3815 /* The costs inherited from successor partitions. */
3816 slpg_layout_cost out_cost;
3819 /* This class tries to optimize the layout of vectors in order to avoid
3820 unnecessary shuffling. At the moment, the set of possible layouts is
3821 restricted to bijective permutations.
3823 The goal of the pass depends on whether we're optimizing for size or
3824 for speed. When optimizing for size, the goal is to reduce the overall
3825 number of layout changes (including layout changes implied by things
3826 like load permutations). When optimizing for speed, the goal is to
3827 reduce the maximum latency attributable to layout changes on any
3828 non-cyclical path through the data flow graph.
3830 For example, when optimizing a loop nest for speed, we will prefer
3831 to make layout changes outside of a loop rather than inside of a loop,
3832 and will prefer to make layout changes in parallel rather than serially,
3833 even if that increases the overall number of layout changes.
3835 The high-level procedure is:
3837 (1) Build a graph in which edges go from uses (parents) to definitions
3838 (children).
3840 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3842 (3) When optimizing for speed, partition the nodes in each SCC based
3843 on their containing cfg loop. When optimizing for size, treat
3844 each SCC as a single partition.
3846 This gives us a dag of partitions. The goal is now to assign a
3847 layout to each partition.
3849 (4) Construct a set of vector layouts that are worth considering.
3850 Record which nodes must keep their current layout.
3852 (5) Perform a forward walk over the partition dag (from loads to stores)
3853 accumulating the "forward" cost of using each layout. When visiting
3854 each partition, assign a tentative choice of layout to the partition
3855 and use that choice when calculating the cost of using a different
3856 layout in successor partitions.
3858 (6) Perform a backward walk over the partition dag (from stores to loads),
3859 accumulating the "backward" cost of using each layout. When visiting
3860 each partition, make a final choice of layout for that partition based
3861 on the accumulated forward costs (from (5)) and backward costs
3862 (from (6)).
3864 (7) Apply the chosen layouts to the SLP graph.
3866 For example, consider the SLP statements:
3868 S1: a_1 = load
3869 loop:
3870 S2: a_2 = PHI<a_1, a_3>
3871 S3: b_1 = load
3872 S4: a_3 = a_2 + b_1
3873 exit:
3874 S5: a_4 = PHI<a_3>
3875 S6: store a_4
3877 S2 and S4 form an SCC and are part of the same loop. Every other
3878 statement is in a singleton SCC. In this example there is a one-to-one
3879 mapping between SCCs and partitions and the partition dag looks like this:
3881      S1    S3
3882       \    /
3883       S2+S4
3884         |
3885        S5
3886         |
3887        S6
3889 S2, S3 and S4 will have a higher execution frequency than the other
3890 statements, so when optimizing for speed, the goal is to avoid any
3891 layout changes:
3893 - within S3
3894 - within S2+S4
3895 - on the S3->S2+S4 edge
3897 For example, if S3 was originally a reversing load, the goal of the
3898 pass is to make it an unreversed load and change the layout on the
3899 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
3900 on S1->S2+S4 and S5->S6 would also be acceptable.)
3902 The difference between SCCs and partitions becomes important if we
3903 add an outer loop:
3905 S1: a_1 = ...
3906 loop1:
3907 S2: a_2 = PHI<a_1, a_6>
3908 S3: b_1 = load
3909 S4: a_3 = a_2 + b_1
3910 loop2:
3911 S5: a_4 = PHI<a_3, a_5>
3912 S6: c_1 = load
3913 S7: a_5 = a_4 + c_1
3914 exit2:
3915 S8: a_6 = PHI<a_5>
3916 S9: store a_6
3917 exit1:
3919 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
3920 for speed, we usually do not want restrictions in the outer loop to "infect"
3921 the decision for the inner loop. For example, if an outer-loop node
3922 in the SCC contains a statement with a fixed layout, that should not
3923 prevent the inner loop from using a different layout. Conversely,
3924 the inner loop should not dictate a layout to the outer loop: if the
3925 outer loop does a lot of computation, then it may not be efficient to
3926 do all of that computation in the inner loop's preferred layout.
3928 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
3929 and S5+S7 (inner). We also try to arrange partitions so that:
3931 - the partition for an outer loop comes before the partition for
3932 an inner loop
3934 - if a sibling loop A dominates a sibling loop B, A's partition
3935 comes before B's
3937 This gives the following partition dag for the example above:
3939      S1    S3
3940       \    /
3941      S2+S4+S8    S6
3942        |    \\   /
3943        |    S5+S7
3944        |
3945       S9
3947 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
3948 one for a reversal of the edge S7->S8.
3950 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
3951 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
3952 preferred layout against the cost of changing the layout on entry to the
3953 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
3955 Although this works well when optimizing for speed, it has the downside
3956 when optimizing for size that the choice of layout for S5+S7 is completely
3957 independent of S9, which lessens the chance of reducing the overall number
3958 of permutations. We therefore do not partition SCCs when optimizing
3959 for size.
3961 To give a concrete example of the difference between optimizing
3962 for size and speed, consider:
3964 a[0] = (b[1] << c[3]) - d[1];
3965 a[1] = (b[0] << c[2]) - d[0];
3966 a[2] = (b[3] << c[1]) - d[3];
3967 a[3] = (b[2] << c[0]) - d[2];
3969 There are three different layouts here: one for a, one for b and d,
3970 and one for c. When optimizing for speed it is better to permute each
3971 of b, c and d into the order required by a, since those permutations
3972 happen in parallel. But when optimizing for size, it is better to:
3974 - permute c into the same order as b
3975 - do the arithmetic
3976 - permute the result into the order required by a
3978 This gives 2 permutations rather than 3. */
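/* Concretely, a size-optimized sequence for that example could be
   (illustrative pseudo statements only):

     cp = { c[2], c[3], c[0], c[1] }    <-- permute c into b's and d's order
     t  = (b << cp) - d                 <-- arithmetic in b's and d's order
     a  = { t[1], t[0], t[3], t[2] }    <-- permute the result into a's order

   which uses the two permutations CP and A, whereas the speed-optimized
   sequence permutes each of b, c and d into a's order up front.  */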
3980 class vect_optimize_slp_pass
3982 public:
3983 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
3984 void run ();
3986 private:
3987 /* Graph building. */
3988 struct loop *containing_loop (slp_tree);
3989 bool is_cfg_latch_edge (graph_edge *);
3990 void build_vertices (hash_set<slp_tree> &, slp_tree);
3991 void build_vertices ();
3992 void build_graph ();
3994 /* Partitioning. */
3995 void create_partitions ();
3996 template<typename T> void for_each_partition_edge (unsigned int, T);
3998 /* Layout selection. */
3999 bool is_compatible_layout (slp_tree, unsigned int);
4000 int change_layout_cost (slp_tree, unsigned int, unsigned int);
4001 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
4002 unsigned int);
4003 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
4004 int, unsigned int);
4005 int internal_node_cost (slp_tree, int, unsigned int);
4006 void start_choosing_layouts ();
4008 /* Cost propagation. */
4009 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
4010 unsigned int, unsigned int);
4011 slpg_layout_cost total_in_cost (unsigned int);
4012 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
4013 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
4014 void forward_pass ();
4015 void backward_pass ();
4017 /* Rematerialization. */
4018 slp_tree get_result_with_layout (slp_tree, unsigned int);
4019 void materialize ();
4021 /* Clean-up. */
4022 void remove_redundant_permutations ();
4024 void dump ();
4026 vec_info *m_vinfo;
4028 /* True if we should optimize the graph for size, false if we should
4029 optimize it for speed. (It wouldn't be easy to make this decision
4030 more locally.) */
4031 bool m_optimize_size;
4033 /* A graph of all SLP nodes, with edges leading from uses to definitions.
4034 In other words, a node's predecessors are its slp_tree parents and
4035 a node's successors are its slp_tree children. */
4036 graph *m_slpg = nullptr;
4038 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
4039 auto_vec<slpg_vertex> m_vertices;
4041 /* The list of all leaves of M_SLPG, such as external definitions, constants,
4042 and loads. */
4043 auto_vec<int> m_leafs;
4045 /* This array has one entry for every vector layout that we're considering.
4046 Element 0 is null and indicates "no change". Other entries describe
4047 permutations that are inherent in the current graph and that we would
4048 like to reverse if possible.
4050 For example, a permutation { 1, 2, 3, 0 } means that something has
4051 effectively been permuted in that way, such as a load group
4052 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
4053 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
4054 in order to put things "back" in order. */
4055 auto_vec<vec<unsigned> > m_perms;
4057 /* A partitioning of the nodes for which a layout must be chosen.
4058 Each partition represents an <SCC, cfg loop> pair; that is,
4059 nodes in different SCCs belong to different partitions, and nodes
4060 within an SCC can be further partitioned according to a containing
4061 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4063 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4064 from leaves (such as loads) to roots (such as stores).
4066 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4067 auto_vec<slpg_partition_info> m_partitions;
4069 /* The list of all nodes for which a layout must be chosen. Nodes for
4070 partition P come before the nodes for partition P+1. Nodes within a
4071 partition are in reverse postorder. */
4072 auto_vec<unsigned int> m_partitioned_nodes;
4074 /* Index P * num-layouts + L contains the cost of using layout L
4075 for partition P. */
4076 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4078 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4079 original output of node N adjusted to have layout L. */
4080 auto_vec<slp_tree> m_node_layouts;
4083 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4084 Also record whether we should optimize anything for speed rather
4085 than size. */
4087 void
4088 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4089 slp_tree node)
4091 unsigned i;
4092 slp_tree child;
4094 if (visited.add (node))
4095 return;
4097 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4099 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4100 if (optimize_bb_for_speed_p (bb))
4101 m_optimize_size = false;
4104 node->vertex = m_vertices.length ();
4105 m_vertices.safe_push (slpg_vertex (node));
4107 bool leaf = true;
4108 bool force_leaf = false;
4109 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4110 if (child)
4112 leaf = false;
4113 build_vertices (visited, child);
4115 else
4116 force_leaf = true;
4117 /* Since SLP discovery works along use-def edges all cycles have an
4118 entry - but there's the exception of cycles where we do not handle
4119 the entry explicitly (but with a NULL SLP node), like some reductions
4120 and inductions. Force those SLP PHIs to act as leafs to make them
4121 backwards reachable. */
4122 if (leaf || force_leaf)
4123 m_leafs.safe_push (node->vertex);
4126 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4128 void
4129 vect_optimize_slp_pass::build_vertices ()
4131 hash_set<slp_tree> visited;
4132 unsigned i;
4133 slp_instance instance;
4134 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4135 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4138 /* Apply (reverse) bijective PERM to VEC. */
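/* For example, if PERM is { 1, 2, 3, 0 } and VEC starts as { A, B, C, D },
   then REVERSE false turns VEC into { B, C, D, A } (VEC permuted by PERM),
   while REVERSE true turns the original VEC into { D, A, B, C } (VEC
   permuted by the inverse of PERM).  */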
4140 template <class T>
4141 static void
4142 vect_slp_permute (vec<unsigned> perm,
4143 vec<T> &vec, bool reverse)
4145 auto_vec<T, 64> saved;
4146 saved.create (vec.length ());
4147 for (unsigned i = 0; i < vec.length (); ++i)
4148 saved.quick_push (vec[i]);
4150 if (reverse)
4152 for (unsigned i = 0; i < vec.length (); ++i)
4153 vec[perm[i]] = saved[i];
4154 for (unsigned i = 0; i < vec.length (); ++i)
4155 gcc_assert (vec[perm[i]] == saved[i]);
4157 else
4159 for (unsigned i = 0; i < vec.length (); ++i)
4160 vec[i] = saved[perm[i]];
4161 for (unsigned i = 0; i < vec.length (); ++i)
4162 gcc_assert (vec[i] == saved[perm[i]]);
4166 /* Return the cfg loop that contains NODE. */
4168 struct loop *
4169 vect_optimize_slp_pass::containing_loop (slp_tree node)
4171 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4172 if (!rep)
4173 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4174 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4177 /* Return true if UD (an edge from a use to a definition) is associated
4178 with a loop latch edge in the cfg. */
4180 bool
4181 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4183 slp_tree use = m_vertices[ud->src].node;
4184 slp_tree def = m_vertices[ud->dest].node;
4185 if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4186 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4187 return false;
4189 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4190 return (is_a<gphi *> (use_rep->stmt)
4191 && bb_loop_header_p (gimple_bb (use_rep->stmt))
4192 && containing_loop (def) == containing_loop (use));
4195 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4196 a nonnull data field. */
4198 void
4199 vect_optimize_slp_pass::build_graph ()
4201 m_optimize_size = true;
4202 build_vertices ();
4204 m_slpg = new_graph (m_vertices.length ());
4205 for (slpg_vertex &v : m_vertices)
4206 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4207 if (child)
4209 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4210 if (is_cfg_latch_edge (ud))
4211 ud->data = this;
4215 /* Return true if E corresponds to a loop latch edge in the cfg. */
4217 static bool
4218 skip_cfg_latch_edges (graph_edge *e)
4220 return e->data;
4223 /* Create the node partitions. */
4225 void
4226 vect_optimize_slp_pass::create_partitions ()
4228 /* Calculate a postorder of the graph, ignoring edges that correspond
4229 to natural latch edges in the cfg. Reading the vector from the end
4230 to the beginning gives the reverse postorder. */
4231 auto_vec<int> initial_rpo;
4232 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4233 false, NULL, skip_cfg_latch_edges);
4234 gcc_assert (initial_rpo.length () == m_vertices.length ());
4236 /* Calculate the strongly connected components of the graph. */
4237 auto_vec<int> scc_grouping;
4238 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4240 /* Create a new index order in which all nodes from the same SCC are
4241 consecutive. Use scc_pos to record the index of the first node in
4242 each SCC. */
4243 auto_vec<unsigned int> scc_pos (num_sccs);
4244 int last_component = -1;
4245 unsigned int node_count = 0;
4246 for (unsigned int node_i : scc_grouping)
4248 if (last_component != m_slpg->vertices[node_i].component)
4250 last_component = m_slpg->vertices[node_i].component;
4251 gcc_assert (last_component == int (scc_pos.length ()));
4252 scc_pos.quick_push (node_count);
4254 node_count += 1;
4256 gcc_assert (node_count == initial_rpo.length ()
4257 && last_component + 1 == int (num_sccs));
4259 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4260 inside each SCC following the RPO we calculated above. The fact that
4261 we ignored natural latch edges when calculating the RPO should ensure
4262 that, for natural loop nests:
4264 - the first node that we encounter in a cfg loop is the loop header phi
4265 - the loop header phis are in dominance order
4267 Arranging for this is an optimization (see below) rather than a
4268 correctness issue. Unnatural loops with a tangled mess of backedges
4269 will still work correctly, but might give poorer results.
4271 Also update scc_pos so that it gives 1 + the index of the last node
4272 in the SCC. */
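/* For instance, with three SCCs of sizes 2, 3 and 1, scc_pos is { 0, 2, 5 }
   on entry to the loop below and { 2, 5, 6 } afterwards.  */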
4273 m_partitioned_nodes.safe_grow (node_count);
4274 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4276 unsigned int node_i = initial_rpo[old_i];
4277 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4278 m_partitioned_nodes[new_i] = node_i;
4281 /* When optimizing for speed, partition each SCC based on the containing
4282 cfg loop. The order we constructed above should ensure that, for natural
4283 cfg loops, we'll create sub-SCC partitions for outer loops before
4284 the corresponding sub-SCC partitions for inner loops. Similarly,
4285 when one sibling loop A dominates another sibling loop B, we should
4286 create a sub-SCC partition for A before a sub-SCC partition for B.
4288 As above, nothing depends for correctness on whether this achieves
4289 a natural nesting, but we should get better results when it does. */
4290 m_partitions.reserve (m_vertices.length ());
4291 unsigned int next_partition_i = 0;
4292 hash_map<struct loop *, int> loop_partitions;
4293 unsigned int rpo_begin = 0;
4294 unsigned int num_partitioned_nodes = 0;
4295 for (unsigned int rpo_end : scc_pos)
4297 loop_partitions.empty ();
4298 unsigned int partition_i = next_partition_i;
4299 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4301 /* Handle externals and constants optimistically throughout.
4302 But treat existing vectors as fixed since we do not handle
4303 permuting them. */
4304 unsigned int node_i = m_partitioned_nodes[rpo_i];
4305 auto &vertex = m_vertices[node_i];
4306 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4307 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4308 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4309 vertex.partition = -1;
4310 else
4312 bool existed;
4313 if (m_optimize_size)
4314 existed = next_partition_i > partition_i;
4315 else
4317 struct loop *loop = containing_loop (vertex.node);
4318 auto &entry = loop_partitions.get_or_insert (loop, &existed);
4319 if (!existed)
4320 entry = next_partition_i;
4321 partition_i = entry;
4323 if (!existed)
4325 m_partitions.quick_push (slpg_partition_info ());
4326 next_partition_i += 1;
4328 vertex.partition = partition_i;
4329 num_partitioned_nodes += 1;
4330 m_partitions[partition_i].node_end += 1;
4333 rpo_begin = rpo_end;
4336 /* Assign ranges of consecutive node indices to each partition,
4337 in partition order. Start with node_end being the same as
4338 node_begin so that the next loop can use it as a counter. */
4339 unsigned int node_begin = 0;
4340 for (auto &partition : m_partitions)
4342 partition.node_begin = node_begin;
4343 node_begin += partition.node_end;
4344 partition.node_end = partition.node_begin;
4346 gcc_assert (node_begin == num_partitioned_nodes);
4348 /* Finally build the list of nodes in partition order. */
4349 m_partitioned_nodes.truncate (num_partitioned_nodes);
4350 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4352 int partition_i = m_vertices[node_i].partition;
4353 if (partition_i >= 0)
4355 unsigned int order_i = m_partitions[partition_i].node_end++;
4356 m_partitioned_nodes[order_i] = node_i;
4361 /* Look for edges from earlier partitions into node NODE_I and edges from
4362 node NODE_I into later partitions. Call:
4364 FN (ud, other_node_i)
4366 for each such use-to-def edge ud, where other_node_i is the node at the
4367 other end of the edge. */
4369 template<typename T>
4370 void
4371 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4373 int partition_i = m_vertices[node_i].partition;
4374 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4375 pred; pred = pred->pred_next)
4377 int src_partition_i = m_vertices[pred->src].partition;
4378 if (src_partition_i >= 0 && src_partition_i != partition_i)
4379 fn (pred, pred->src);
4381 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4382 succ; succ = succ->succ_next)
4384 int dest_partition_i = m_vertices[succ->dest].partition;
4385 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4386 fn (succ, succ->dest);
4390 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4391 that NODE would operate on. This test is independent of NODE's actual
4392 operation. */
4394 bool
4395 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4396 unsigned int layout_i)
4398 if (layout_i == 0)
4399 return true;
4401 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4402 return false;
4404 return true;
4407 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4408 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4409 layouts is incompatible with NODE or if the change is not possible for
4410 some other reason.
4412 The properties taken from NODE include the number of lanes and the
4413 vector type. The actual operation doesn't matter. */
4416 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4417 unsigned int from_layout_i,
4418 unsigned int to_layout_i)
4420 if (!is_compatible_layout (node, from_layout_i)
4421 || !is_compatible_layout (node, to_layout_i))
4422 return -1;
4424 if (from_layout_i == to_layout_i)
4425 return 0;
4427 auto_vec<slp_tree, 1> children (1);
4428 children.quick_push (node);
4429 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4430 if (from_layout_i > 0)
4431 for (unsigned int i : m_perms[from_layout_i])
4432 perm.quick_push ({ 0, i });
4433 else
4434 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4435 perm.quick_push ({ 0, i });
4436 if (to_layout_i > 0)
4437 vect_slp_permute (m_perms[to_layout_i], perm, true);
4438 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4439 children, false);
4440 if (count >= 0)
4441 return MAX (count, 1);
4443 /* ??? In principle we could try changing via layout 0, giving two
4444 layout changes rather than 1. Doing that would require
4445 corresponding support in get_result_with_layout. */
4446 return -1;
4449 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4451 inline slpg_partition_layout_costs &
4452 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4453 unsigned int layout_i)
4455 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4458 /* Change PERM in one of two ways:
4460 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4461 chosen for child I of NODE.
4463 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4465 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
4467 void
4468 vect_optimize_slp_pass::
4469 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4470 int in_layout_i, unsigned int out_layout_i)
4472 for (auto &entry : perm)
4474 int this_in_layout_i = in_layout_i;
4475 if (this_in_layout_i < 0)
4477 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4478 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4479 this_in_layout_i = m_partitions[in_partition_i].layout;
4481 if (this_in_layout_i > 0)
4482 entry.second = m_perms[this_in_layout_i][entry.second];
4484 if (out_layout_i > 0)
4485 vect_slp_permute (m_perms[out_layout_i], perm, true);
4488 /* Check whether the target allows NODE to be rearranged so that the node's
4489 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4490 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4492 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4493 NODE can adapt to the layout changes that have (perhaps provisionally)
4494 been chosen for NODE's children, so that no extra permutations are
4495 needed on either the input or the output of NODE.
4497 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4498 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4500 IN_LAYOUT_I has no meaning for other types of node.
4502 Keeping the node as-is is always valid. If the target doesn't appear
4503 to support the node as-is, but might realistically support other layouts,
4504 then layout 0 instead has the cost of a worst-case permutation. On the
4505 one hand, this ensures that every node has at least one valid layout,
4506 avoiding what would otherwise be an awkward special case. On the other,
4507 it still encourages the pass to change an invalid pre-existing layout
4508 choice into a valid one. */
4511 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4512 unsigned int out_layout_i)
4514 const int fallback_cost = 1;
4516 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4518 auto_lane_permutation_t tmp_perm;
4519 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4521 /* Check that the child nodes support the chosen layout. Checking
4522 the first child is enough, since any second child would have the
4523 same shape. */
4524 auto first_child = SLP_TREE_CHILDREN (node)[0];
4525 if (in_layout_i > 0
4526 && !is_compatible_layout (first_child, in_layout_i))
4527 return -1;
4529 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4530 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4531 node, tmp_perm,
4532 SLP_TREE_CHILDREN (node),
4533 false);
4534 if (count < 0)
4536 if (in_layout_i == 0 && out_layout_i == 0)
4538 /* Use the fallback cost if the node could in principle support
4539 some nonzero layout for both the inputs and the outputs.
4540 Otherwise assume that the node will be rejected later
4541 and rebuilt from scalars. */
4542 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4543 return fallback_cost;
4544 return 0;
4546 return -1;
4549 /* We currently have no way of telling whether the new layout is cheaper
4550 or more expensive than the old one. But at least in principle,
4551 it should be worth making zero permutations (whole-vector shuffles)
4552 cheaper than real permutations, in case the pass is able to remove
4553 the latter. */
4554 return count == 0 ? 0 : 1;
4557 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4558 if (rep
4559 && STMT_VINFO_DATA_REF (rep)
4560 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4561 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4563 auto_load_permutation_t tmp_perm;
4564 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4565 if (out_layout_i > 0)
4566 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4568 poly_uint64 vf = 1;
4569 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4570 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4571 unsigned int n_perms;
4572 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4573 nullptr, vf, true, false, &n_perms))
4575 auto rep = SLP_TREE_REPRESENTATIVE (node);
4576 if (out_layout_i == 0)
4578 /* Use the fallback cost if the load is an N-to-N permutation.
4579 Otherwise assume that the node will be rejected later
4580 and rebuilt from scalars. */
4581 if (STMT_VINFO_GROUPED_ACCESS (rep)
4582 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4583 == SLP_TREE_LANES (node)))
4584 return fallback_cost;
4585 return 0;
4587 return -1;
4590 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4591 return n_perms == 0 ? 0 : 1;
4594 return 0;
4597 /* Decide which element layouts we should consider using. Calculate the
4598 weights associated with inserting layout changes on partition edges.
4599 Also mark partitions that cannot change layout, by setting their
4600 layout to zero. */
4602 void
4603 vect_optimize_slp_pass::start_choosing_layouts ()
4605 /* Used to assign unique permutation indices. */
4606 using perm_hash = unbounded_hashmap_traits<
4607 vec_free_hash_base<int_hash_base<unsigned>>,
4608 int_hash<int, -1, -2>
4610 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4612 /* Layout 0 is "no change". */
4613 m_perms.safe_push (vNULL);
4615 /* Create layouts from existing permutations. */
4616 auto_load_permutation_t tmp_perm;
4617 for (unsigned int node_i : m_partitioned_nodes)
4619 /* Leafs also double as entries to the reverse graph. Allow the
4620 layout of those to be changed. */
4621 auto &vertex = m_vertices[node_i];
4622 auto &partition = m_partitions[vertex.partition];
4623 if (!m_slpg->vertices[node_i].succ)
4624 partition.layout = 0;
4626 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4627 slp_tree node = vertex.node;
4628 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4629 slp_tree child;
4630 unsigned HOST_WIDE_INT imin, imax = 0;
4631 bool any_permute = false;
4632 tmp_perm.truncate (0);
4633 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4635 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4636 unpermuted, record a layout that reverses this permutation.
4638 We would need more work to cope with loads that are internally
4639 permuted and also have inputs (such as masks for
4640 IFN_MASK_LOADs). */
4641 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4642 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4644 partition.layout = -1;
4645 continue;
4647 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4648 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4649 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4651 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4652 && SLP_TREE_CHILDREN (node).length () == 1
4653 && (child = SLP_TREE_CHILDREN (node)[0])
4654 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4655 .is_constant (&imin)))
4657 /* If the child has the same vector size as this node,
4658 reversing the permutation can make the permutation a no-op.
4659 In other cases it can change a true permutation into a
4660 full-vector extract. */
4661 tmp_perm.reserve (SLP_TREE_LANES (node));
4662 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4663 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4665 else
4666 continue;
4668 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4670 unsigned idx = tmp_perm[j];
4671 imin = MIN (imin, idx);
4672 imax = MAX (imax, idx);
4673 if (idx - tmp_perm[0] != j)
4674 any_permute = true;
4676 /* If the span doesn't match we'd disrupt VF computation, avoid
4677 that for now. */
4678 if (imax - imin + 1 != SLP_TREE_LANES (node))
4679 continue;
4680 /* If there's no permute, no need to split one out. In this case
4681 we can consider turning a load into a permuted load, if that
4682 turns out to be cheaper than alternatives. */
4683 if (!any_permute)
4685 partition.layout = -1;
4686 continue;
4689 /* For now only handle true permutes, like
4690 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4691 when permuting constants and invariants, keeping the permute
4692 bijective. */
4693 auto_sbitmap load_index (SLP_TREE_LANES (node));
4694 bitmap_clear (load_index);
4695 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4696 bitmap_set_bit (load_index, tmp_perm[j] - imin);
4697 unsigned j;
4698 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4699 if (!bitmap_bit_p (load_index, j))
4700 break;
4701 if (j != SLP_TREE_LANES (node))
4702 continue;
4704 vec<unsigned> perm = vNULL;
4705 perm.safe_grow (SLP_TREE_LANES (node), true);
4706 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4707 perm[j] = tmp_perm[j] - imin;
4709 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4711 /* Continue to use existing layouts, but don't add any more. */
4712 int *entry = layout_ids.get (perm);
4713 partition.layout = entry ? *entry : 0;
4714 perm.release ();
4716 else
4718 bool existed;
4719 int &layout_i = layout_ids.get_or_insert (perm, &existed);
4720 if (existed)
4721 perm.release ();
4722 else
4724 layout_i = m_perms.length ();
4725 m_perms.safe_push (perm);
4727 partition.layout = layout_i;
4731 /* Initially assume that every layout is possible and has zero cost
4732 in every partition. */
4733 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4734 * m_perms.length ());
4736 /* We have to mark as to be materialized any outgoing permutations facing
4737 non-associating-reduction graph entries that are not represented in the graph.
4738 slp_inst_kind_bb_reduc currently only covers associatable reductions. */
4739 for (slp_instance instance : m_vinfo->slp_instances)
4740 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4742 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4743 m_partitions[m_vertices[node_i].partition].layout = 0;
4745 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4747 stmt_vec_info stmt_info
4748 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4749 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4750 if (needs_fold_left_reduction_p (TREE_TYPE
4751 (gimple_get_lhs (stmt_info->stmt)),
4752 STMT_VINFO_REDUC_CODE (reduc_info)))
4754 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4755 m_partitions[m_vertices[node_i].partition].layout = 0;
4759 /* Check which layouts each node and partition can handle. Calculate the
4760 weights associated with inserting layout changes on edges. */
4761 for (unsigned int node_i : m_partitioned_nodes)
4763 auto &vertex = m_vertices[node_i];
4764 auto &partition = m_partitions[vertex.partition];
4765 slp_tree node = vertex.node;
4767 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4769 vertex.weight = vect_slp_node_weight (node);
4771 /* We do not handle stores with a permutation, so all
4772 incoming permutations must have been materialized.
4774 We also don't handle masked grouped loads, which lack a
4775 permutation vector. In this case the memory locations
4776 form an implicit second input to the loads, on top of the
4777 explicit mask input, and the memory input's layout cannot
4778 be changed.
4780 On the other hand, we do support permuting gather loads and
4781 masked gather loads, where each scalar load is independent
4782 of the others. This can be useful if the address/index input
4783 benefits from permutation. */
4784 if (STMT_VINFO_DATA_REF (rep)
4785 && STMT_VINFO_GROUPED_ACCESS (rep)
4786 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4787 partition.layout = 0;
4789 /* We cannot change the layout of an operation that does not
4790 operate on its lanes independently. Note this is an explicit
4791 negative list since that's much shorter than the respective
4792 positive one but it's critical to keep maintaining it. */
4793 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4794 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4796 case CFN_COMPLEX_ADD_ROT90:
4797 case CFN_COMPLEX_ADD_ROT270:
4798 case CFN_COMPLEX_MUL:
4799 case CFN_COMPLEX_MUL_CONJ:
4800 case CFN_VEC_ADDSUB:
4801 case CFN_VEC_FMADDSUB:
4802 case CFN_VEC_FMSUBADD:
4803 partition.layout = 0;
4804 default:;
4808 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4810 auto &other_vertex = m_vertices[other_node_i];
4812 /* Count the number of edges from earlier partitions and the number
4813 of edges to later partitions. */
4814 if (other_vertex.partition < vertex.partition)
4815 partition.in_degree += 1;
4816 else
4817 partition.out_degree += 1;
4819 /* If the current node uses the result of OTHER_NODE_I, accumulate
4820 the effects of that. */
4821 if (ud->src == int (node_i))
4823 other_vertex.out_weight += vertex.weight;
4824 other_vertex.out_degree += 1;
4827 for_each_partition_edge (node_i, process_edge);
4831 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4832 its current (provisional) choice of layout. The inputs do not necessarily
4833 have the same layout as each other. */
4835 slpg_layout_cost
4836 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4838 auto &vertex = m_vertices[node_i];
4839 slpg_layout_cost cost;
4840 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4842 auto &other_vertex = m_vertices[other_node_i];
4843 if (other_vertex.partition < vertex.partition)
4845 auto &other_partition = m_partitions[other_vertex.partition];
4846 auto &other_costs = partition_layout_costs (other_vertex.partition,
4847 other_partition.layout);
4848 slpg_layout_cost this_cost = other_costs.in_cost;
4849 this_cost.add_serial_cost (other_costs.internal_cost);
4850 this_cost.split (other_partition.out_degree);
4851 cost.add_parallel_cost (this_cost);
4854 for_each_partition_edge (node_i, add_cost);
4855 return cost;
4858 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4859 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4860 slpg_layout_cost::impossible () if the change isn't possible. */
4862 slpg_layout_cost
4863 vect_optimize_slp_pass::
4864 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4865 unsigned int layout2_i)
4867 auto &def_vertex = m_vertices[ud->dest];
4868 auto &use_vertex = m_vertices[ud->src];
4869 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4870 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4871 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4872 use_layout_i);
4873 if (factor < 0)
4874 return slpg_layout_cost::impossible ();
4876 /* We have a choice of putting the layout change at the site of the
4877 definition or at the site of the use. Prefer the former when
4878 optimizing for size or when the execution frequency of the
4879 definition is no greater than the combined execution frequencies of
4880 the uses. When putting the layout change at the site of the definition,
4881 divvy up the cost among all consumers. */
4882 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4884 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4885 cost.split (def_vertex.out_degree);
4886 return cost;
4888 return { use_vertex.weight * factor, m_optimize_size };
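/* For instance, suppose a definition outside a loop (weight 1) feeds two
   uses inside the loop (weight 10 each, so the definition's OUT_WEIGHT is 20
   and its OUT_DEGREE is 2), and suppose the layout change itself has a
   FACTOR of 1. Putting the change at the definition charges each edge
   { depth 1, total 0.5 }, whereas putting it at a use would charge
   { depth 10, total 10 }, so the definition site is preferred. If instead
   the definition were inside the loop and its single use outside it,
   the use site would be charged.  */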
4891 /* UD represents a use-def link between FROM_NODE_I and a node in a later
4892 partition; FROM_NODE_I could be the definition node or the use node.
4893 The node at the other end of the link wants to use layout TO_LAYOUT_I.
4894 Return the cost of any necessary fix-ups on edge UD, or return
4895 slpg_layout_cost::impossible () if the change isn't possible.
4897 At this point, FROM_NODE_I's partition has chosen the cheapest
4898 layout based on the information available so far, but this choice
4899 is only provisional. */
4901 slpg_layout_cost
4902 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
4903 unsigned int to_layout_i)
4905 auto &from_vertex = m_vertices[from_node_i];
4906 unsigned int from_partition_i = from_vertex.partition;
4907 slpg_partition_info &from_partition = m_partitions[from_partition_i];
4908 gcc_assert (from_partition.layout >= 0);
4910 /* First calculate the cost on the assumption that FROM_PARTITION sticks
4911 with its current layout preference. */
4912 slpg_layout_cost cost = slpg_layout_cost::impossible ();
4913 auto edge_cost = edge_layout_cost (ud, from_node_i,
4914 from_partition.layout, to_layout_i);
4915 if (edge_cost.is_possible ())
4917 auto &from_costs = partition_layout_costs (from_partition_i,
4918 from_partition.layout);
4919 cost = from_costs.in_cost;
4920 cost.add_serial_cost (from_costs.internal_cost);
4921 cost.split (from_partition.out_degree);
4922 cost.add_serial_cost (edge_cost);
4925 /* Take the minimum of that cost and the cost that applies if
4926 FROM_PARTITION instead switches to TO_LAYOUT_I. */
4927 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
4928 to_layout_i);
4929 if (direct_layout_costs.is_possible ())
4931 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
4932 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
4933 direct_cost.split (from_partition.out_degree);
4934 if (!cost.is_possible ()
4935 || direct_cost.is_better_than (cost, m_optimize_size))
4936 cost = direct_cost;
4939 return cost;
4942 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
4943 partition; TO_NODE_I could be the definition node or the use node.
4944 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
4945 return the cost of any necessary fix-ups on edge UD, or
4946 slpg_layout_cost::impossible () if the choice cannot be made.
4948 At this point, TO_NODE_I's partition has a fixed choice of layout. */
4950 slpg_layout_cost
4951 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
4952 unsigned int from_layout_i)
4954 auto &to_vertex = m_vertices[to_node_i];
4955 unsigned int to_partition_i = to_vertex.partition;
4956 slpg_partition_info &to_partition = m_partitions[to_partition_i];
4957 gcc_assert (to_partition.layout >= 0);
4959 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
4960 adjusted for this input having layout FROM_LAYOUT_I. Assume that
4961 any other inputs keep their current choice of layout. */
4962 auto &to_costs = partition_layout_costs (to_partition_i,
4963 to_partition.layout);
4964 if (ud->src == int (to_node_i)
4965 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
4967 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
4968 auto old_layout = from_partition.layout;
4969 from_partition.layout = from_layout_i;
4970 int factor = internal_node_cost (to_vertex.node, -1,
4971 to_partition.layout);
4972 from_partition.layout = old_layout;
4973 if (factor >= 0)
4975 slpg_layout_cost cost = to_costs.out_cost;
4976 cost.add_serial_cost ({ to_vertex.weight * factor,
4977 m_optimize_size });
4978 cost.split (to_partition.in_degree);
4979 return cost;
4983 /* Compute the cost if we insert any necessary layout change on edge UD. */
4984 auto edge_cost = edge_layout_cost (ud, to_node_i,
4985 to_partition.layout, from_layout_i);
4986 if (edge_cost.is_possible ())
4988 slpg_layout_cost cost = to_costs.out_cost;
4989 cost.add_serial_cost (to_costs.internal_cost);
4990 cost.split (to_partition.in_degree);
4991 cost.add_serial_cost (edge_cost);
4992 return cost;
4995 return slpg_layout_cost::impossible ();
4998 /* Make a forward pass through the partitions, accumulating input costs.
4999 Make a tentative (provisional) choice of layout for each partition,
5000 ensuring that this choice still allows later partitions to keep
5001 their original layout. */
5003 void
5004 vect_optimize_slp_pass::forward_pass ()
5006 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5007 ++partition_i)
5009 auto &partition = m_partitions[partition_i];
5011 /* If the partition consists of a single VEC_PERM_EXPR, precompute
5012 the incoming cost that would apply if every predecessor partition
5013 keeps its current layout. This is used within the loop below. */
5014 slpg_layout_cost in_cost;
5015 slp_tree single_node = nullptr;
5016 if (partition.node_end == partition.node_begin + 1)
5018 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
5019 single_node = m_vertices[node_i].node;
5020 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5021 in_cost = total_in_cost (node_i);
5024 /* Go through the possible layouts. Decide which ones are valid
5025 for this partition and record which of the valid layouts has
5026 the lowest cost. */
5027 unsigned int min_layout_i = 0;
5028 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5029 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5031 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5032 if (!layout_costs.is_possible ())
5033 continue;
5035 /* If the recorded layout is already 0 then the layout cannot
5036 change. */
5037 if (partition.layout == 0 && layout_i != 0)
5039 layout_costs.mark_impossible ();
5040 continue;
5043 bool is_possible = true;
5044 for (unsigned int order_i = partition.node_begin;
5045 order_i < partition.node_end; ++order_i)
5047 unsigned int node_i = m_partitioned_nodes[order_i];
5048 auto &vertex = m_vertices[node_i];
5050 /* Reject the layout if it is individually incompatible
5051 with any node in the partition. */
5052 if (!is_compatible_layout (vertex.node, layout_i))
5054 is_possible = false;
5055 break;
5058 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5060 auto &other_vertex = m_vertices[other_node_i];
5061 if (other_vertex.partition < vertex.partition)
5063 /* Accumulate the incoming costs from earlier
5064 partitions, plus the cost of any layout changes
5065 on UD itself. */
5066 auto cost = forward_cost (ud, other_node_i, layout_i);
5067 if (!cost.is_possible ())
5068 is_possible = false;
5069 else
5070 layout_costs.in_cost.add_parallel_cost (cost);
5072 else
5073 /* Reject the layout if it would make layout 0 impossible
5074 for later partitions. This amounts to testing that the
5075 target supports reversing the layout change on edges
5076 to later partitions.
5078 In principle, it might be possible to push a layout
5079 change all the way down a graph, so that it never
5080 needs to be reversed and so that the target doesn't
5081 need to support the reverse operation. But it would
5082 be awkward to bail out if we hit a partition that
5083 does not support the new layout, especially since
5084 we are not dealing with a lattice. */
5085 is_possible &= edge_layout_cost (ud, other_node_i, 0,
5086 layout_i).is_possible ();
5088 for_each_partition_edge (node_i, add_cost);
5090 /* Accumulate the cost of using LAYOUT_I within NODE,
5091 both for the inputs and the outputs. */
5092 int factor = internal_node_cost (vertex.node, layout_i,
5093 layout_i);
5094 if (factor < 0)
5096 is_possible = false;
5097 break;
5099 else if (factor)
5100 layout_costs.internal_cost.add_serial_cost
5101 ({ vertex.weight * factor, m_optimize_size });
5103 if (!is_possible)
5105 layout_costs.mark_impossible ();
5106 continue;
5109 /* Combine the incoming and partition-internal costs. */
5110 slpg_layout_cost combined_cost = layout_costs.in_cost;
5111 combined_cost.add_serial_cost (layout_costs.internal_cost);
5113 /* If this partition consists of a single VEC_PERM_EXPR, see
5114 if the VEC_PERM_EXPR can be changed to support output layout
5115 LAYOUT_I while keeping all the provisional choices of input
5116 layout. */
5117 if (single_node
5118 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5120 int factor = internal_node_cost (single_node, -1, layout_i);
5121 if (factor >= 0)
5123 auto weight = m_vertices[single_node->vertex].weight;
5124 slpg_layout_cost internal_cost
5125 = { weight * factor, m_optimize_size };
5127 slpg_layout_cost alt_cost = in_cost;
5128 alt_cost.add_serial_cost (internal_cost);
5129 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5131 combined_cost = alt_cost;
5132 layout_costs.in_cost = in_cost;
5133 layout_costs.internal_cost = internal_cost;
5138 /* Record the layout with the lowest cost. Prefer layout 0 in
5139 the event of a tie between it and another layout. */
5140 if (!min_layout_cost.is_possible ()
5141 || combined_cost.is_better_than (min_layout_cost,
5142 m_optimize_size))
5144 min_layout_i = layout_i;
5145 min_layout_cost = combined_cost;
5149 /* This loop's handling of earlier partitions should ensure that
5150 choosing the original layout for the current partition is no
5151 less valid than it was in the original graph, even with the
5152 provisional layout choices for those earlier partitions. */
5153 gcc_assert (min_layout_cost.is_possible ());
5154 partition.layout = min_layout_i;
5158 /* Make a backward pass through the partitions, accumulating output costs.
5159 Make a final choice of layout for each partition. */
5161 void
5162 vect_optimize_slp_pass::backward_pass ()
5164 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5166 auto &partition = m_partitions[partition_i];
5168 unsigned int min_layout_i = 0;
5169 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5170 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5172 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5173 if (!layout_costs.is_possible ())
5174 continue;
5176 /* Accumulate the costs from successor partitions. */
5177 bool is_possible = true;
5178 for (unsigned int order_i = partition.node_begin;
5179 order_i < partition.node_end; ++order_i)
5181 unsigned int node_i = m_partitioned_nodes[order_i];
5182 auto &vertex = m_vertices[node_i];
5183 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5185 auto &other_vertex = m_vertices[other_node_i];
5186 auto &other_partition = m_partitions[other_vertex.partition];
5187 if (other_vertex.partition > vertex.partition)
5189 /* Accumulate the incoming costs from later
5190 partitions, plus the cost of any layout changes
5191 on UD itself. */
5192 auto cost = backward_cost (ud, other_node_i, layout_i);
5193 if (!cost.is_possible ())
5194 is_possible = false;
5195 else
5196 layout_costs.out_cost.add_parallel_cost (cost);
5198 else
5199 /* Make sure that earlier partitions can (if necessary
5200 or beneficial) keep the layout that they chose in
5201 the forward pass. This ensures that there is at
5202 least one valid choice of layout. */
5203 is_possible &= edge_layout_cost (ud, other_node_i,
5204 other_partition.layout,
5205 layout_i).is_possible ();
5207 for_each_partition_edge (node_i, add_cost);
5209 if (!is_possible)
5211 layout_costs.mark_impossible ();
5212 continue;
5215 /* Locally combine the costs from the forward and backward passes.
5216 (This combined cost is not passed on, since that would lead
5217 to double counting.) */
5218 slpg_layout_cost combined_cost = layout_costs.in_cost;
5219 combined_cost.add_serial_cost (layout_costs.internal_cost);
5220 combined_cost.add_serial_cost (layout_costs.out_cost);
5222 /* Record the layout with the lowest cost. Prefer layout 0 in
5223 the event of a tie between it and another layout. */
5224 if (!min_layout_cost.is_possible ()
5225 || combined_cost.is_better_than (min_layout_cost,
5226 m_optimize_size))
5228 min_layout_i = layout_i;
5229 min_layout_cost = combined_cost;
5233 gcc_assert (min_layout_cost.is_possible ());
5234 partition.layout = min_layout_i;
5238 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5239 NODE already has the layout that was selected for its partition. */
5241 slp_tree
5242 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5243 unsigned int to_layout_i)
5245 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5246 slp_tree result = m_node_layouts[result_i];
5247 if (result)
5248 return result;
5250 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5251 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5252 /* We can't permute vector defs in place. */
5253 && SLP_TREE_VEC_DEFS (node).is_empty ()))
5255 /* If the vector is uniform or unchanged, there's nothing to do. */
5256 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5257 result = node;
5258 else
5260 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5261 result = vect_create_new_slp_node (scalar_ops);
5262 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5265 else
5267 unsigned int partition_i = m_vertices[node->vertex].partition;
5268 unsigned int from_layout_i = m_partitions[partition_i].layout;
5269 if (from_layout_i == to_layout_i)
5270 return node;
5272 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5273 permutation instead of a serial one. Leave the new permutation
5274 in TMP_PERM on success. */
5275 auto_lane_permutation_t tmp_perm;
5276 unsigned int num_inputs = 1;
5277 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5279 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5280 if (from_layout_i != 0)
5281 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5282 if (to_layout_i != 0)
5283 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5284 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5285 tmp_perm,
5286 SLP_TREE_CHILDREN (node),
5287 false) >= 0)
5288 num_inputs = SLP_TREE_CHILDREN (node).length ();
5289 else
5290 tmp_perm.truncate (0);
5293 if (dump_enabled_p ())
5295 if (tmp_perm.length () > 0)
5296 dump_printf_loc (MSG_NOTE, vect_location,
5297 "duplicating permutation node %p with"
5298 " layout %d\n",
5299 (void *) node, to_layout_i);
5300 else
5301 dump_printf_loc (MSG_NOTE, vect_location,
5302 "inserting permutation node in place of %p\n",
5303 (void *) node);
5306 unsigned int num_lanes = SLP_TREE_LANES (node);
5307 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5308 if (SLP_TREE_SCALAR_STMTS (node).length ())
5310 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5311 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5312 if (from_layout_i != 0)
5313 vect_slp_permute (m_perms[from_layout_i], stmts, false);
5314 if (to_layout_i != 0)
5315 vect_slp_permute (m_perms[to_layout_i], stmts, true);
5317 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5318 SLP_TREE_LANES (result) = num_lanes;
5319 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5320 result->vertex = -1;
5322 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5323 if (tmp_perm.length ())
5325 lane_perm.safe_splice (tmp_perm);
5326 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5328 else
5330 lane_perm.create (num_lanes);
5331 for (unsigned j = 0; j < num_lanes; ++j)
5332 lane_perm.quick_push ({ 0, j });
5333 if (from_layout_i != 0)
5334 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5335 if (to_layout_i != 0)
5336 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5337 SLP_TREE_CHILDREN (result).safe_push (node);
5339 for (slp_tree child : SLP_TREE_CHILDREN (result))
5340 child->refcnt++;
5342 m_node_layouts[result_i] = result;
5343 return result;
5346 /* Apply the chosen vector layouts to the SLP graph. */
5348 void
5349 vect_optimize_slp_pass::materialize ()
5351 /* We no longer need the costs, so avoid having two O(N * P) arrays
5352 live at the same time. */
5353 m_partition_layout_costs.release ();
5354 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5356 auto_sbitmap fully_folded (m_vertices.length ());
5357 bitmap_clear (fully_folded);
5358 for (unsigned int node_i : m_partitioned_nodes)
5360 auto &vertex = m_vertices[node_i];
5361 slp_tree node = vertex.node;
5362 int layout_i = m_partitions[vertex.partition].layout;
5363 gcc_assert (layout_i >= 0);
5365 /* Rearrange the scalar statements to match the chosen layout. */
5366 if (layout_i > 0)
5367 vect_slp_permute (m_perms[layout_i],
5368 SLP_TREE_SCALAR_STMTS (node), true);
5370 /* Update load and lane permutations. */
5371 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5373 /* First try to absorb the input vector layouts. If that fails,
5374 force the inputs to have layout LAYOUT_I too. We checked that
5375 that was possible before deciding to use nonzero output layouts.
5376 (Note that at this stage we don't really have any guarantee that
5377 the target supports the original VEC_PERM_EXPR.) */
5378 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5379 auto_lane_permutation_t tmp_perm;
5380 tmp_perm.safe_splice (perm);
5381 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5382 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5383 tmp_perm,
5384 SLP_TREE_CHILDREN (node),
5385 false) >= 0)
5387 if (dump_enabled_p ()
5388 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5389 perm.begin ()))
5390 dump_printf_loc (MSG_NOTE, vect_location,
5391 "absorbing input layouts into %p\n",
5392 (void *) node);
5393 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5394 bitmap_set_bit (fully_folded, node_i);
5396 else
5398 /* Not MSG_MISSED because it would make no sense to users. */
5399 if (dump_enabled_p ())
5400 dump_printf_loc (MSG_NOTE, vect_location,
5401 "failed to absorb input layouts into %p\n",
5402 (void *) node);
5403 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5406 else
5408 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5409 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5410 if (layout_i > 0)
5411 /* ??? When we handle non-bijective permutes the idea
5412 is that we can force the load-permutation to be
5413 { min, min + 1, min + 2, ... max }. But then the
5414 scalar defs might no longer match the lane content
5415 which means wrong-code with live lane vectorization.
5416 So we possibly have to have NULL entries for those. */
5417 vect_slp_permute (m_perms[layout_i], load_perm, true);
5421 /* Do this before any nodes disappear, since it involves a walk
5422 over the leaves. */
5423 remove_redundant_permutations ();
5425 /* Replace each child with a correctly laid-out version. */
5426 for (unsigned int node_i : m_partitioned_nodes)
5428 /* Skip nodes that have already been handled above. */
5429 if (bitmap_bit_p (fully_folded, node_i))
5430 continue;
5432 auto &vertex = m_vertices[node_i];
5433 int in_layout_i = m_partitions[vertex.partition].layout;
5434 gcc_assert (in_layout_i >= 0);
5436 unsigned j;
5437 slp_tree child;
5438 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5440 if (!child)
5441 continue;
5443 slp_tree new_child = get_result_with_layout (child, in_layout_i);
5444 if (new_child != child)
5446 vect_free_slp_tree (child);
5447 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5448 new_child->refcnt += 1;
5454 /* Elide load permutations that are not necessary. Such permutations might
5455 be pre-existing, rather than created by the layout optimizations. */
5457 void
5458 vect_optimize_slp_pass::remove_redundant_permutations ()
5460 for (unsigned int node_i : m_leafs)
5462 slp_tree node = m_vertices[node_i].node;
5463 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5464 continue;
5466 /* In basic block vectorization we allow any subchain of an interleaving
5467 chain.
5468 FORNOW: not in loop SLP because of realignment complications. */
5469 if (is_a <bb_vec_info> (m_vinfo))
5471 bool subchain_p = true;
5472 stmt_vec_info next_load_info = NULL;
5473 stmt_vec_info load_info;
5474 unsigned j;
5475 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5477 if (j != 0
5478 && (next_load_info != load_info
5479 || DR_GROUP_GAP (load_info) != 1))
5481 subchain_p = false;
5482 break;
5484 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5486 if (subchain_p)
5488 SLP_TREE_LOAD_PERMUTATION (node).release ();
5489 continue;
5492 else
5494 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5495 stmt_vec_info load_info;
5496 bool this_load_permuted = false;
5497 unsigned j;
5498 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5499 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5501 this_load_permuted = true;
5502 break;
5504 /* When this isn't a grouped access we know it's single element
5505 and contiguous. */
5506 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5508 if (!this_load_permuted
5509 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5510 || SLP_TREE_LANES (node) == 1))
5511 SLP_TREE_LOAD_PERMUTATION (node).release ();
5512 continue;
5514 stmt_vec_info first_stmt_info
5515 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5516 if (!this_load_permuted
5517 /* The load requires permutation when unrolling exposes
5518 a gap either because the group is larger than the SLP
5519 group-size or because there is a gap between the groups. */
5520 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5521 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5522 && DR_GROUP_GAP (first_stmt_info) == 0)))
5524 SLP_TREE_LOAD_PERMUTATION (node).release ();
5525 continue;
5531 /* Print the partition graph and layout information to the dump file. */
5533 void
5534 vect_optimize_slp_pass::dump ()
5536 dump_printf_loc (MSG_NOTE, vect_location,
5537 "SLP optimize permutations:\n");
5538 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5540 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5541 const char *sep = "";
5542 for (unsigned int idx : m_perms[layout_i])
5544 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5545 sep = ", ";
5547 dump_printf (MSG_NOTE, " }\n");
5549 dump_printf_loc (MSG_NOTE, vect_location,
5550 "SLP optimize partitions:\n");
5551 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5552 ++partition_i)
5554 auto &partition = m_partitions[partition_i];
5555 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5556 dump_printf_loc (MSG_NOTE, vect_location,
5557 " partition %d (layout %d):\n",
5558 partition_i, partition.layout);
5559 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5560 for (unsigned int order_i = partition.node_begin;
5561 order_i < partition.node_end; ++order_i)
5563 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5564 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5565 (void *) vertex.node);
5566 dump_printf_loc (MSG_NOTE, vect_location,
5567 " weight: %f\n",
5568 vertex.weight.to_double ());
5569 if (vertex.out_degree)
5570 dump_printf_loc (MSG_NOTE, vect_location,
5571 " out weight: %f (degree %d)\n",
5572 vertex.out_weight.to_double (),
5573 vertex.out_degree);
5574 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5575 dump_printf_loc (MSG_NOTE, vect_location,
5576 " op: VEC_PERM_EXPR\n");
5577 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5578 dump_printf_loc (MSG_NOTE, vect_location,
5579 " op template: %G", rep->stmt);
5581 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5582 for (unsigned int order_i = partition.node_begin;
5583 order_i < partition.node_end; ++order_i)
5585 unsigned int node_i = m_partitioned_nodes[order_i];
5586 auto &vertex = m_vertices[node_i];
5587 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5589 auto &other_vertex = m_vertices[other_node_i];
5590 if (other_vertex.partition < vertex.partition)
5591 dump_printf_loc (MSG_NOTE, vect_location,
5592 " - %p [%d] --> %p\n",
5593 (void *) other_vertex.node,
5594 other_vertex.partition,
5595 (void *) vertex.node);
5596 else
5597 dump_printf_loc (MSG_NOTE, vect_location,
5598 " - %p --> [%d] %p\n",
5599 (void *) vertex.node,
5600 other_vertex.partition,
5601 (void *) other_vertex.node);
5603 for_each_partition_edge (node_i, print_edge);
5606 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5608 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5609 if (layout_costs.is_possible ())
5611 dump_printf_loc (MSG_NOTE, vect_location,
5612 " layout %d:%s\n", layout_i,
5613 partition.layout == int (layout_i)
5614 ? " (*)" : "");
5615 slpg_layout_cost combined_cost = layout_costs.in_cost;
5616 combined_cost.add_serial_cost (layout_costs.internal_cost);
5617 combined_cost.add_serial_cost (layout_costs.out_cost);
5618 #define TEMPLATE "{depth: %f, total: %f}"
5619 dump_printf_loc (MSG_NOTE, vect_location,
5620 " " TEMPLATE "\n",
5621 layout_costs.in_cost.depth.to_double (),
5622 layout_costs.in_cost.total.to_double ());
5623 dump_printf_loc (MSG_NOTE, vect_location,
5624 " + " TEMPLATE "\n",
5625 layout_costs.internal_cost.depth.to_double (),
5626 layout_costs.internal_cost.total.to_double ());
5627 dump_printf_loc (MSG_NOTE, vect_location,
5628 " + " TEMPLATE "\n",
5629 layout_costs.out_cost.depth.to_double (),
5630 layout_costs.out_cost.total.to_double ());
5631 dump_printf_loc (MSG_NOTE, vect_location,
5632 " = " TEMPLATE "\n",
5633 combined_cost.depth.to_double (),
5634 combined_cost.total.to_double ());
5635 #undef TEMPLATE
5637 else
5638 dump_printf_loc (MSG_NOTE, vect_location,
5639 " layout %d: rejected\n", layout_i);
5644 /* Main entry point for the SLP graph optimization pass. */
5646 void
5647 vect_optimize_slp_pass::run ()
5649 build_graph ();
5650 create_partitions ();
5651 start_choosing_layouts ();
5652 if (m_perms.length () > 1)
5654 forward_pass ();
5655 backward_pass ();
5656 if (dump_enabled_p ())
5657 dump ();
5658 materialize ();
5659 while (!m_perms.is_empty ())
5660 m_perms.pop ().release ();
5662 else
5663 remove_redundant_permutations ();
5664 free_graph (m_slpg);
5667 /* Optimize the SLP graph of VINFO. */
5669 void
5670 vect_optimize_slp (vec_info *vinfo)
5672 if (vinfo->slp_instances.is_empty ())
5673 return;
5674 vect_optimize_slp_pass (vinfo).run ();
5677 /* Gather loads reachable from the individual SLP graph entries. */
5679 void
5680 vect_gather_slp_loads (vec_info *vinfo)
5682 unsigned i;
5683 slp_instance instance;
5684 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5686 hash_set<slp_tree> visited;
5687 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5688 SLP_INSTANCE_TREE (instance), visited);
5693 /* For each possible SLP instance decide whether to SLP it and calculate overall
5694 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
5695 least one instance. */
5697 bool
5698 vect_make_slp_decision (loop_vec_info loop_vinfo)
5700 unsigned int i;
5701 poly_uint64 unrolling_factor = 1;
5702 const vec<slp_instance> &slp_instances
5703 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5704 slp_instance instance;
5705 int decided_to_slp = 0;
5707 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5709 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5711 /* FORNOW: SLP if you can. */
5712 /* All unroll factors have the form:
5714 GET_MODE_SIZE (vinfo->vector_mode) * X
5716 for some rational X, so they must have a common multiple. */
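      /* Illustrative example: instances with unrolling factors 2 and 3
	 would force a combined unrolling factor of 6 here.  */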
5717 unrolling_factor
5718 = force_common_multiple (unrolling_factor,
5719 SLP_INSTANCE_UNROLLING_FACTOR (instance));
5721 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5722 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5723 loop-based vectorization. Such stmts will be marked as HYBRID. */
5724 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5725 decided_to_slp++;
5728 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5730 if (decided_to_slp && dump_enabled_p ())
5732 dump_printf_loc (MSG_NOTE, vect_location,
5733 "Decided to SLP %d instances. Unrolling factor ",
5734 decided_to_slp);
5735 dump_dec (MSG_NOTE, unrolling_factor);
5736 dump_printf (MSG_NOTE, "\n");
5739 return (decided_to_slp > 0);
5742 /* Private data for vect_detect_hybrid_slp. */
5743 struct vdhs_data
5745 loop_vec_info loop_vinfo;
5746 vec<stmt_vec_info> *worklist;
5749 /* Walker for walk_gimple_op. */
5751 static tree
5752 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5754 walk_stmt_info *wi = (walk_stmt_info *)data;
5755 vdhs_data *dat = (vdhs_data *)wi->info;
5757 if (wi->is_lhs)
5758 return NULL_TREE;
5760 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5761 if (!def_stmt_info)
5762 return NULL_TREE;
5763 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5764 if (PURE_SLP_STMT (def_stmt_info))
5766 if (dump_enabled_p ())
5767 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5768 def_stmt_info->stmt);
5769 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5770 dat->worklist->safe_push (def_stmt_info);
5773 return NULL_TREE;
5776 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
5777    if so, otherwise push it to WORKLIST.  */
5779 static void
5780 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5781 vec<stmt_vec_info> &worklist,
5782 stmt_vec_info stmt_info)
5784 if (dump_enabled_p ())
5785 dump_printf_loc (MSG_NOTE, vect_location,
5786 "Processing hybrid candidate : %G", stmt_info->stmt);
5787 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5788 imm_use_iterator iter2;
5789 ssa_op_iter iter1;
5790 use_operand_p use_p;
5791 def_operand_p def_p;
5792 bool any_def = false;
5793 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5795 any_def = true;
5796 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5798 if (is_gimple_debug (USE_STMT (use_p)))
5799 continue;
5800 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5801	  /* An out-of-loop use means this is a loop_vect sink.  */
5802 if (!use_info)
5804 if (dump_enabled_p ())
5805 dump_printf_loc (MSG_NOTE, vect_location,
5806 "Found loop_vect sink: %G", stmt_info->stmt);
5807 worklist.safe_push (stmt_info);
5808 return;
5810 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5812 if (dump_enabled_p ())
5813 dump_printf_loc (MSG_NOTE, vect_location,
5814 "Found loop_vect use: %G", use_info->stmt);
5815 worklist.safe_push (stmt_info);
5816 return;
5820   /* No def means this is a loop_vect sink.  */
5821 if (!any_def)
5823 if (dump_enabled_p ())
5824 dump_printf_loc (MSG_NOTE, vect_location,
5825 "Found loop_vect sink: %G", stmt_info->stmt);
5826 worklist.safe_push (stmt_info);
5827 return;
5829 if (dump_enabled_p ())
5830 dump_printf_loc (MSG_NOTE, vect_location,
5831 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5832 STMT_SLP_TYPE (stmt_info) = pure_slp;
5835 /* Find stmts that must be both vectorized and SLPed. */
5837 void
5838 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5840 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5842 /* All stmts participating in SLP are marked pure_slp, all other
5843 stmts are loop_vect.
5844 First collect all loop_vect stmts into a worklist.
5845 SLP patterns cause not all original scalar stmts to appear in
5846 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5847    Rectify this here and do a backward walk over the IL, only considering
5848    stmts as loop_vect when they are used by a loop_vect stmt, and otherwise
5849    marking them as pure_slp.  */
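   /* Illustrative scenario: a def produced by a pure_slp stmt but consumed
      by a remaining loop_vect stmt (e.g. a non-SLP reduction update) must be
      vectorized by both schemes and is marked hybrid by the walk below.  */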
5850 auto_vec<stmt_vec_info> worklist;
5851 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5853 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5854 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5855 gsi_next (&gsi))
5857 gphi *phi = gsi.phi ();
5858 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5859 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5860 maybe_push_to_hybrid_worklist (loop_vinfo,
5861 worklist, stmt_info);
5863 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5864 gsi_prev (&gsi))
5866 gimple *stmt = gsi_stmt (gsi);
5867 if (is_gimple_debug (stmt))
5868 continue;
5869 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
5870 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5872 for (gimple_stmt_iterator gsi2
5873 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5874 !gsi_end_p (gsi2); gsi_next (&gsi2))
5876 stmt_vec_info patt_info
5877 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
5878 if (!STMT_SLP_TYPE (patt_info)
5879 && STMT_VINFO_RELEVANT (patt_info))
5880 maybe_push_to_hybrid_worklist (loop_vinfo,
5881 worklist, patt_info);
5883 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5885 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5886 maybe_push_to_hybrid_worklist (loop_vinfo,
5887 worklist, stmt_info);
5891 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
5892 mark any SLP vectorized stmt as hybrid.
5893 ??? We're visiting def stmts N times (once for each non-SLP and
5894 once for each hybrid-SLP use). */
5895 walk_stmt_info wi;
5896 vdhs_data dat;
5897 dat.worklist = &worklist;
5898 dat.loop_vinfo = loop_vinfo;
5899 memset (&wi, 0, sizeof (wi));
5900 wi.info = (void *)&dat;
5901 while (!worklist.is_empty ())
5903 stmt_vec_info stmt_info = worklist.pop ();
5904 /* Since SSA operands are not set up for pattern stmts we need
5905 to use walk_gimple_op. */
5906 wi.is_lhs = 0;
5907 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
5908       /* For gather/scatter make sure to walk the offset operand, which
5909	  can be a scaling and conversion away.  */
5910 gather_scatter_info gs_info;
5911 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5912 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
5914 int dummy;
5915 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
5921 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
5923 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
5924 : vec_info (vec_info::bb, shared),
5925 bbs (_bbs),
5926 roots (vNULL)
5928 for (unsigned i = 0; i < bbs.length (); ++i)
5930 if (i != 0)
5931 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5932 gsi_next (&si))
5934 gphi *phi = si.phi ();
5935 gimple_set_uid (phi, 0);
5936 add_stmt (phi);
5938 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5939 !gsi_end_p (gsi); gsi_next (&gsi))
5941 gimple *stmt = gsi_stmt (gsi);
5942 gimple_set_uid (stmt, 0);
5943 if (is_gimple_debug (stmt))
5944 continue;
5945 add_stmt (stmt);
5951 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
5952 stmts in the basic block. */
5954 _bb_vec_info::~_bb_vec_info ()
5956 /* Reset region marker. */
5957 for (unsigned i = 0; i < bbs.length (); ++i)
5959 if (i != 0)
5960 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5961 gsi_next (&si))
5963 gphi *phi = si.phi ();
5964 gimple_set_uid (phi, -1);
5966 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5967 !gsi_end_p (gsi); gsi_next (&gsi))
5969 gimple *stmt = gsi_stmt (gsi);
5970 gimple_set_uid (stmt, -1);
5974 for (unsigned i = 0; i < roots.length (); ++i)
5976 roots[i].stmts.release ();
5977 roots[i].roots.release ();
5978 roots[i].remain.release ();
5980 roots.release ();
5983 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
5984    given that child nodes have already been processed, and that
5985 their def types currently match their SLP node's def type. */
5987 static bool
5988 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
5989 slp_instance node_instance,
5990 stmt_vector_for_cost *cost_vec)
5992 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5994 /* Calculate the number of vector statements to be created for the
5995 scalar stmts in this node. For SLP reductions it is equal to the
5996 number of vector statements in the children (which has already been
5997 calculated by the recursive call). Otherwise it is the number of
5998 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
5999 VF divided by the number of elements in a vector. */
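  /* Illustrative numbers: a node with 4 lanes, VF 2 and vector type V4SI
     needs (2 * 4) / 4 == 2 vector statements.  */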
6000 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6001 && !STMT_VINFO_DATA_REF (stmt_info)
6002 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6004 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
6005 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
6007 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6008 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
6009 break;
6012 else
6014 poly_uint64 vf;
6015 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6016 vf = loop_vinfo->vectorization_factor;
6017 else
6018 vf = 1;
6019 unsigned int group_size = SLP_TREE_LANES (node);
6020 tree vectype = SLP_TREE_VECTYPE (node);
6021 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6022 = vect_get_num_vectors (vf * group_size, vectype);
6025 /* Handle purely internal nodes. */
6026 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6028 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
6029 return false;
6031 stmt_vec_info slp_stmt_info;
6032 unsigned int i;
6033 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
6035 if (STMT_VINFO_LIVE_P (slp_stmt_info)
6036 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6037 node_instance, i,
6038 false, cost_vec))
6039 return false;
6041 return true;
6044 bool dummy;
6045 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6046 node, node_instance, cost_vec);
6049 /* Try to build NODE from scalars, returning true on success.
6050 NODE_INSTANCE is the SLP instance that contains NODE. */
6052 static bool
6053 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6054 slp_instance node_instance)
6056 stmt_vec_info stmt_info;
6057 unsigned int i;
6059 if (!is_a <bb_vec_info> (vinfo)
6060 || node == SLP_INSTANCE_TREE (node_instance)
6061 || !SLP_TREE_SCALAR_STMTS (node).exists ()
6062 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6063 /* Force the mask use to be built from scalars instead. */
6064 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6065 return false;
6067 if (dump_enabled_p ())
6068 dump_printf_loc (MSG_NOTE, vect_location,
6069 "Building vector operands of %p from scalars instead\n",
6070 (void *) node);
6072 /* Don't remove and free the child nodes here, since they could be
6073 referenced by other structures. The analysis and scheduling phases
6074 (need to) ignore child nodes of anything that isn't vect_internal_def. */
6075 unsigned int group_size = SLP_TREE_LANES (node);
6076 SLP_TREE_DEF_TYPE (node) = vect_external_def;
6077 /* Invariants get their vector type from the uses. */
6078 SLP_TREE_VECTYPE (node) = NULL_TREE;
6079 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6080 SLP_TREE_LOAD_PERMUTATION (node).release ();
6081 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6083 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6084 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6086 return true;
6089 /* Return true if all elements of the slice are the same. */
6090 bool
6091 vect_scalar_ops_slice::all_same_p () const
6093 for (unsigned int i = 1; i < length; ++i)
6094 if (!operand_equal_p (op (0), op (i)))
6095 return false;
6096 return true;
6099 hashval_t
6100 vect_scalar_ops_slice_hash::hash (const value_type &s)
6102 hashval_t hash = 0;
6103 for (unsigned i = 0; i < s.length; ++i)
6104 hash = iterative_hash_expr (s.op (i), hash);
6105 return hash;
6108 bool
6109 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6110 const compare_type &s2)
6112 if (s1.length != s2.length)
6113 return false;
6114 for (unsigned i = 0; i < s1.length; ++i)
6115 if (!operand_equal_p (s1.op (i), s2.op (i)))
6116 return false;
6117 return true;
6120 /* Compute the prologue cost for invariant or constant operands represented
6121 by NODE. */
6123 static void
6124 vect_prologue_cost_for_slp (slp_tree node,
6125 stmt_vector_for_cost *cost_vec)
6127   /* There's a special case of an existing vector, which costs nothing.  */
6128 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6129 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6130 return;
6131 /* Without looking at the actual initializer a vector of
6132      constants can be implemented as a load from the constant pool.
6133 When all elements are the same we can use a splat. */
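  /* Illustrative cases: a constant vector such as { 1, 2, 3, 4 } is costed
     as a constant-pool load (vector_load), an external { x, x, x, x } as a
     splat (scalar_to_vec), and distinct external defs { a, b, c, d } as a
     vec_construct; see the classification below.  */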
6134 tree vectype = SLP_TREE_VECTYPE (node);
6135 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6136 unsigned HOST_WIDE_INT const_nunits;
6137 unsigned nelt_limit;
6138 auto ops = &SLP_TREE_SCALAR_OPS (node);
6139 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6140 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6141 && ! multiple_p (const_nunits, group_size))
6143 nelt_limit = const_nunits;
6144 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6145 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6146 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6147 starts.quick_push (i * const_nunits);
6149 else
6151 /* If either the vector has variable length or the vectors
6152 are composed of repeated whole groups we only need to
6153 cost construction once. All vectors will be the same. */
6154 nelt_limit = group_size;
6155 starts.quick_push (0);
6157 /* ??? We're just tracking whether vectors in a single node are the same.
6158 Ideally we'd do something more global. */
6159 bool passed = false;
6160 for (unsigned int start : starts)
6162 vect_cost_for_stmt kind;
6163 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6164 kind = vector_load;
6165 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6166 kind = scalar_to_vec;
6167 else
6168 kind = vec_construct;
6169 /* The target cost hook has no idea which part of the SLP node
6170 we are costing so avoid passing it down more than once. Pass
6171 it to the first vec_construct or scalar_to_vec part since for those
6172 the x86 backend tries to account for GPR to XMM register moves. */
6173 record_stmt_cost (cost_vec, 1, kind,
6174 (kind != vector_load && !passed) ? node : nullptr,
6175 vectype, 0, vect_prologue);
6176 if (kind != vector_load)
6177 passed = true;
6181 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6182    the subtree.  NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
6184 Return true if the operations are supported. */
6186 static bool
6187 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6188 slp_instance node_instance,
6189 hash_set<slp_tree> &visited_set,
6190 vec<slp_tree> &visited_vec,
6191 stmt_vector_for_cost *cost_vec)
6193 int i, j;
6194 slp_tree child;
6196 /* Assume we can code-generate all invariants. */
6197 if (!node
6198 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6199 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6200 return true;
6202 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6204 if (dump_enabled_p ())
6205 dump_printf_loc (MSG_NOTE, vect_location,
6206 "Failed cyclic SLP reference in %p\n", (void *) node);
6207 return false;
6209 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6211 /* If we already analyzed the exact same set of scalar stmts we're done.
6212 We share the generated vector stmts for those. */
6213 if (visited_set.add (node))
6214 return true;
6215 visited_vec.safe_push (node);
6217 bool res = true;
6218 unsigned visited_rec_start = visited_vec.length ();
6219 unsigned cost_vec_rec_start = cost_vec->length ();
6220 bool seen_non_constant_child = false;
6221 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6223 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6224 visited_set, visited_vec,
6225 cost_vec);
6226 if (!res)
6227 break;
6228 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6229 seen_non_constant_child = true;
6231 /* We're having difficulties scheduling nodes with just constant
6232 operands and no scalar stmts since we then cannot compute a stmt
6233 insertion place. */
6234 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6236 if (dump_enabled_p ())
6237 dump_printf_loc (MSG_NOTE, vect_location,
6238 "Cannot vectorize all-constant op node %p\n",
6239 (void *) node);
6240 res = false;
6243 if (res)
6244 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6245 cost_vec);
6246 /* If analysis failed we have to pop all recursive visited nodes
6247 plus ourselves. */
6248 if (!res)
6250 while (visited_vec.length () >= visited_rec_start)
6251 visited_set.remove (visited_vec.pop ());
6252 cost_vec->truncate (cost_vec_rec_start);
6255 /* When the node can be vectorized, cost the invariant nodes it references.
6256    This is not done in DFS order to allow the referring node's
6257    vectorizable_* calls to nail down the invariant nodes' vector type
6258 and possibly unshare it if it needs a different vector type than
6259 other referrers. */
6260 if (res)
6261 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6262 if (child
6263 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6264 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6265 /* Perform usual caching, note code-generation still
6266 code-gens these nodes multiple times but we expect
6267 to CSE them later. */
6268 && !visited_set.add (child))
6270 visited_vec.safe_push (child);
6271 /* ??? After auditing more code paths make a "default"
6272 and push the vector type from NODE to all children
6273 if it is not already set. */
6274 /* Compute the number of vectors to be generated. */
6275 tree vector_type = SLP_TREE_VECTYPE (child);
6276 if (!vector_type)
6278 /* For shifts with a scalar argument we don't need
6279 to cost or code-generate anything.
6280	 ??? Represent this more explicitly.  */
6281 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6282 == shift_vec_info_type)
6283 && j == 1);
6284 continue;
6286 unsigned group_size = SLP_TREE_LANES (child);
6287 poly_uint64 vf = 1;
6288 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6289 vf = loop_vinfo->vectorization_factor;
6290 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6291 = vect_get_num_vectors (vf * group_size, vector_type);
6292 /* And cost them. */
6293 vect_prologue_cost_for_slp (child, cost_vec);
6296 /* If this node or any of its children can't be vectorized, try pruning
6297 the tree here rather than felling the whole thing. */
6298 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6300 /* We'll need to revisit this for invariant costing and number
6301 of vectorized stmt setting. */
6302 res = true;
6305 return res;
6308 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6309 region and that can be vectorized using vectorizable_live_operation
6310    with STMT_VINFO_LIVE_P.  Live operations that are not handled will cause
6311    the scalar code computing them to be retained.  */
6313 static void
6314 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6315 slp_instance instance,
6316 stmt_vector_for_cost *cost_vec,
6317 hash_set<stmt_vec_info> &svisited,
6318 hash_set<slp_tree> &visited)
6320 if (visited.add (node))
6321 return;
6323 unsigned i;
6324 stmt_vec_info stmt_info;
6325 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6326 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6328 if (svisited.contains (stmt_info))
6329 continue;
6330 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6331 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6332 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6333 /* Only the pattern root stmt computes the original scalar value. */
6334 continue;
6335 bool mark_visited = true;
6336 gimple *orig_stmt = orig_stmt_info->stmt;
6337 ssa_op_iter op_iter;
6338 def_operand_p def_p;
6339 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6341 imm_use_iterator use_iter;
6342 gimple *use_stmt;
6343 stmt_vec_info use_stmt_info;
6344 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6345 if (!is_gimple_debug (use_stmt))
6347 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6348 if (!use_stmt_info
6349 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6351 STMT_VINFO_LIVE_P (stmt_info) = true;
6352 if (vectorizable_live_operation (bb_vinfo, stmt_info,
6353 node, instance, i,
6354 false, cost_vec))
6355 /* ??? So we know we can vectorize the live stmt
6356 from one SLP node. If we cannot do so from all
6357 or none consistently we'd have to record which
6358 SLP node (and lane) we want to use for the live
6359 operation. So make sure we can code-generate
6360 from all nodes. */
6361 mark_visited = false;
6362 else
6363 STMT_VINFO_LIVE_P (stmt_info) = false;
6364 break;
6367 /* We have to verify whether we can insert the lane extract
6368 before all uses. The following is a conservative approximation.
6369 We cannot put this into vectorizable_live_operation because
6370 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6371 doesn't work.
6372 Note that while the fact that we emit code for loads at the
6373	 first load should make this a non-problem, leafs we construct
6374 from scalars are vectorized after the last scalar def.
6375 ??? If we'd actually compute the insert location during
6376 analysis we could use sth less conservative than the last
6377 scalar stmt in the node for the dominance check. */
6378 /* ??? What remains is "live" uses in vector CTORs in the same
6379 SLP graph which is where those uses can end up code-generated
6380 right after their definition instead of close to their original
6381 use. But that would restrict us to code-generate lane-extracts
6382 from the latest stmt in a node. So we compensate for this
6383 during code-generation, simply not replacing uses for those
6384 hopefully rare cases. */
6385 if (STMT_VINFO_LIVE_P (stmt_info))
6386 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6387 if (!is_gimple_debug (use_stmt)
6388 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6389 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6390 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6392 if (dump_enabled_p ())
6393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6394 "Cannot determine insertion place for "
6395 "lane extract\n");
6396 STMT_VINFO_LIVE_P (stmt_info) = false;
6397 mark_visited = true;
6400 if (mark_visited)
6401 svisited.add (stmt_info);
6404 slp_tree child;
6405 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6406 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6407 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
6408 cost_vec, svisited, visited);
6411 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6413 static bool
6414 vectorizable_bb_reduc_epilogue (slp_instance instance,
6415 stmt_vector_for_cost *cost_vec)
6417 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6418 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6419 if (reduc_code == MINUS_EXPR)
6420 reduc_code = PLUS_EXPR;
6421 internal_fn reduc_fn;
6422 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6423 if (!vectype
6424 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6425 || reduc_fn == IFN_LAST
6426 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6427 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6428 TREE_TYPE (vectype)))
6430 if (dump_enabled_p ())
6431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6432 "not vectorized: basic block reduction epilogue "
6433 "operation unsupported.\n");
6434 return false;
6437 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6438 cost log2 vector operations plus shuffles and one extraction. */
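  /* Illustrative numbers: for a 4-element vector this records 2 vector_stmt,
     2 vec_perm and 1 vec_to_scalar cost entries (floor_log2 (4) == 2).  */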
6439 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6440 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6441 vectype, 0, vect_body);
6442 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6443 vectype, 0, vect_body);
6444 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6445 vectype, 0, vect_body);
6447 /* Since we replace all stmts of a possibly longer scalar reduction
6448    chain, account for the extra scalar stmts for that.  */
6449 record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
6450 instance->root_stmts[0], 0, vect_body);
6451 return true;
6454 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6455 and recurse to children. */
6457 static void
6458 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6459 hash_set<slp_tree> &visited)
6461 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6462 || visited.add (node))
6463 return;
6465 stmt_vec_info stmt;
6466 unsigned i;
6467 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6468 roots.remove (vect_orig_stmt (stmt));
6470 slp_tree child;
6471 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6472 if (child)
6473 vect_slp_prune_covered_roots (child, roots, visited);
6476 /* Analyze statements in SLP instances of VINFO. Return true if the
6477 operations are supported. */
6479 bool
6480 vect_slp_analyze_operations (vec_info *vinfo)
6482 slp_instance instance;
6483 int i;
6485 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6487 hash_set<slp_tree> visited;
6488 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6490 auto_vec<slp_tree> visited_vec;
6491 stmt_vector_for_cost cost_vec;
6492 cost_vec.create (2);
6493 if (is_a <bb_vec_info> (vinfo))
6494 vect_location = instance->location ();
6495 if (!vect_slp_analyze_node_operations (vinfo,
6496 SLP_INSTANCE_TREE (instance),
6497 instance, visited, visited_vec,
6498 &cost_vec)
6499 /* CTOR instances require vectorized defs for the SLP tree root. */
6500 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6501 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6502 != vect_internal_def
6503 /* Make sure we vectorized with the expected type. */
6504 || !useless_type_conversion_p
6505 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6506 (instance->root_stmts[0]->stmt))),
6507 TREE_TYPE (SLP_TREE_VECTYPE
6508 (SLP_INSTANCE_TREE (instance))))))
6509 /* Check we can vectorize the reduction. */
6510 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6511 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6513 slp_tree node = SLP_INSTANCE_TREE (instance);
6514 stmt_vec_info stmt_info;
6515 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6516 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6517 else
6518 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6519 if (dump_enabled_p ())
6520 dump_printf_loc (MSG_NOTE, vect_location,
6521 "removing SLP instance operations starting from: %G",
6522 stmt_info->stmt);
6523 vect_free_slp_instance (instance);
6524 vinfo->slp_instances.ordered_remove (i);
6525 cost_vec.release ();
6526 while (!visited_vec.is_empty ())
6527 visited.remove (visited_vec.pop ());
6529 else
6531 i++;
6532 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6534 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6535 cost_vec.release ();
6537 else
6538 /* For BB vectorization remember the SLP graph entry
6539 cost for later. */
6540 instance->cost_vec = cost_vec;
6544 /* Now look for SLP instances with a root that are covered by other
6545 instances and remove them. */
6546 hash_set<stmt_vec_info> roots;
6547 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6548 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6549 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6550 if (!roots.is_empty ())
6552 visited.empty ();
6553 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6554 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6555 visited);
6556 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6557 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6558 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6560 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6561 if (dump_enabled_p ())
6562 dump_printf_loc (MSG_NOTE, vect_location,
6563 "removing SLP instance operations starting "
6564 "from: %G", root->stmt);
6565 vect_free_slp_instance (instance);
6566 vinfo->slp_instances.ordered_remove (i);
6568 else
6569 ++i;
6572 /* Compute vectorizable live stmts. */
6573 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6575 hash_set<stmt_vec_info> svisited;
6576 hash_set<slp_tree> visited;
6577 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6579 vect_location = instance->location ();
6580 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6581 instance, &instance->cost_vec, svisited,
6582 visited);
6586 return !vinfo->slp_instances.is_empty ();
6589 /* Get the SLP instance leader from INSTANCE_LEADER, thereby transitively
6590 closing the eventual chain. */
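/* Illustrative example: with leader entries I1 -> I2, I2 -> I3 and I3 -> I3,
   looking up I1 returns I3 and rewrites the I1 and I2 entries to point at I3
   directly.  */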
6592 static slp_instance
6593 get_ultimate_leader (slp_instance instance,
6594 hash_map<slp_instance, slp_instance> &instance_leader)
6596 auto_vec<slp_instance *, 8> chain;
6597 slp_instance *tem;
6598 while (*(tem = instance_leader.get (instance)) != instance)
6600 chain.safe_push (tem);
6601 instance = *tem;
6603 while (!chain.is_empty ())
6604 *chain.pop () = instance;
6605 return instance;
6608 namespace {
6609 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6610 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6611 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6613 INSTANCE_LEADER is as for get_ultimate_leader. */
6615 template<typename T>
6616 bool
6617 vect_map_to_instance (slp_instance instance, T key,
6618 hash_map<T, slp_instance> &key_to_instance,
6619 hash_map<slp_instance, slp_instance> &instance_leader)
6621 bool existed_p;
6622 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6623 if (!existed_p)
6625 else if (key_instance != instance)
6627 /* If we're running into a previously marked key make us the
6628 leader of the current ultimate leader. This keeps the
6629 leader chain acyclic and works even when the current instance
6630 connects two previously independent graph parts. */
6631 slp_instance key_leader
6632 = get_ultimate_leader (key_instance, instance_leader);
6633 if (key_leader != instance)
6634 instance_leader.put (key_leader, instance);
6636 key_instance = instance;
6637 return existed_p;
6641 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6643 static void
6644 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6645 slp_instance instance, slp_tree node,
6646 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6647 hash_map<slp_tree, slp_instance> &node_to_instance,
6648 hash_map<slp_instance, slp_instance> &instance_leader)
6650 stmt_vec_info stmt_info;
6651 unsigned i;
6653 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6654 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6655 instance_leader);
6657 if (vect_map_to_instance (instance, node, node_to_instance,
6658 instance_leader))
6659 return;
6661 slp_tree child;
6662 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6663 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6664 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6665 node_to_instance, instance_leader);
6668 /* Partition the SLP graph into pieces that can be costed independently. */
6670 static void
6671 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6673 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6675 /* First walk the SLP graph assigning each involved scalar stmt a
6676 corresponding SLP graph entry and upon visiting a previously
6677    marked stmt, make the stmt's leader the current SLP graph entry.  */
6678 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6679 hash_map<slp_tree, slp_instance> node_to_instance;
6680 hash_map<slp_instance, slp_instance> instance_leader;
6681 slp_instance instance;
6682 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6684 instance_leader.put (instance, instance);
6685 vect_bb_partition_graph_r (bb_vinfo,
6686 instance, SLP_INSTANCE_TREE (instance),
6687 stmt_to_instance, node_to_instance,
6688 instance_leader);
6691 /* Then collect entries to each independent subgraph. */
6692 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6694 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6695 leader->subgraph_entries.safe_push (instance);
6696 if (dump_enabled_p ()
6697 && leader != instance)
6698 dump_printf_loc (MSG_NOTE, vect_location,
6699 "instance %p is leader of %p\n",
6700 (void *) leader, (void *) instance);
6704 /* Compute the set of scalar stmts participating in internal and external
6705 nodes. */
6707 static void
6708 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6709 hash_set<slp_tree> &visited,
6710 hash_set<stmt_vec_info> &vstmts,
6711 hash_set<stmt_vec_info> &estmts)
6713 int i;
6714 stmt_vec_info stmt_info;
6715 slp_tree child;
6717 if (visited.add (node))
6718 return;
6720 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6722 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6723 vstmts.add (stmt_info);
6725 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6726 if (child)
6727 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6728 vstmts, estmts);
6730 else
6731 for (tree def : SLP_TREE_SCALAR_OPS (node))
6733 stmt_vec_info def_stmt = vinfo->lookup_def (def);
6734 if (def_stmt)
6735 estmts.add (def_stmt);
6740 /* Compute the scalar cost of the SLP node NODE and its children
6741    and record it in COST_VEC.  Do not account defs that are marked in LIFE and
6742 update LIFE according to uses of NODE. */
6744 static void
6745 vect_bb_slp_scalar_cost (vec_info *vinfo,
6746 slp_tree node, vec<bool, va_heap> *life,
6747 stmt_vector_for_cost *cost_vec,
6748 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6749 hash_set<slp_tree> &visited)
6751 unsigned i;
6752 stmt_vec_info stmt_info;
6753 slp_tree child;
6755 if (visited.add (node))
6756 return;
6758 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6760 ssa_op_iter op_iter;
6761 def_operand_p def_p;
6763 if ((*life)[i])
6764 continue;
6766 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6767 gimple *orig_stmt = orig_stmt_info->stmt;
6769 /* If there is a non-vectorized use of the defs then the scalar
6770 stmt is kept live in which case we do not account it or any
6771 required defs in the SLP children in the scalar cost. This
6772 way we make the vectorization more costly when compared to
6773 the scalar cost. */
6774 if (!STMT_VINFO_LIVE_P (stmt_info))
6776 auto_vec<gimple *, 8> worklist;
6777 hash_set<gimple *> *worklist_visited = NULL;
6778 worklist.quick_push (orig_stmt);
6781 gimple *work_stmt = worklist.pop ();
6782 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
6784 imm_use_iterator use_iter;
6785 gimple *use_stmt;
6786 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
6787 DEF_FROM_PTR (def_p))
6788 if (!is_gimple_debug (use_stmt))
6790 stmt_vec_info use_stmt_info
6791 = vinfo->lookup_stmt (use_stmt);
6792 if (!use_stmt_info
6793 || !vectorized_scalar_stmts.contains (use_stmt_info))
6795 if (use_stmt_info
6796 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
6798 /* For stmts participating in patterns we have
6799       to check their uses recursively.  */
6800 if (!worklist_visited)
6801 worklist_visited = new hash_set<gimple *> ();
6802 if (!worklist_visited->add (use_stmt))
6803 worklist.safe_push (use_stmt);
6804 continue;
6806 (*life)[i] = true;
6807 goto next_lane;
6812 while (!worklist.is_empty ());
6813 next_lane:
6814 if (worklist_visited)
6815 delete worklist_visited;
6816 if ((*life)[i])
6817 continue;
6820 /* Count scalar stmts only once. */
6821 if (gimple_visited_p (orig_stmt))
6822 continue;
6823 gimple_set_visited (orig_stmt, true);
6825 vect_cost_for_stmt kind;
6826 if (STMT_VINFO_DATA_REF (orig_stmt_info))
6828 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
6829 kind = scalar_load;
6830 else
6831 kind = scalar_store;
6833 else if (vect_nop_conversion_p (orig_stmt_info))
6834 continue;
6835 /* For single-argument PHIs assume coalescing which means zero cost
6836 for the scalar and the vector PHIs. This avoids artificially
6837 favoring the vector path (but may pessimize it in some cases). */
6838 else if (is_a <gphi *> (orig_stmt_info->stmt)
6839 && gimple_phi_num_args
6840 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
6841 continue;
6842 else
6843 kind = scalar_stmt;
6844 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
6845 SLP_TREE_VECTYPE (node), 0, vect_body);
6848 auto_vec<bool, 20> subtree_life;
6849 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6851 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6853 /* Do not directly pass LIFE to the recursive call, copy it to
6854 confine changes in the callee to the current child/subtree. */
6855 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6857 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
6858 for (unsigned j = 0;
6859 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
6861 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
6862 if (perm.first == i)
6863 subtree_life[perm.second] = (*life)[j];
6866 else
6868 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
6869 subtree_life.safe_splice (*life);
6871 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
6872 vectorized_scalar_stmts, visited);
6873 subtree_life.truncate (0);
6878 /* Comparator for the loop-index sorted cost vectors. */
6880 static int
6881 li_cost_vec_cmp (const void *a_, const void *b_)
6883 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
6884 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
6885 if (a->first < b->first)
6886 return -1;
6887 else if (a->first == b->first)
6888 return 0;
6889 return 1;
6892 /* Check if vectorization of the basic block is profitable for the
6893 subgraph denoted by SLP_INSTANCES. */
6895 static bool
6896 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
6897 vec<slp_instance> slp_instances,
6898 loop_p orig_loop)
6900 slp_instance instance;
6901 int i;
6902 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
6903 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
6905 if (dump_enabled_p ())
6907 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
6908 hash_set<slp_tree> visited;
6909 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6910 vect_print_slp_graph (MSG_NOTE, vect_location,
6911 SLP_INSTANCE_TREE (instance), visited);
6914 /* Compute the set of scalar stmts we know will go away 'locally' when
6915 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
6916 not accurate for nodes promoted extern late or for scalar stmts that
6917 are used both in extern defs and in vectorized defs. */
6918 hash_set<stmt_vec_info> vectorized_scalar_stmts;
6919 hash_set<stmt_vec_info> scalar_stmts_in_externs;
6920 hash_set<slp_tree> visited;
6921 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6923 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
6924 SLP_INSTANCE_TREE (instance),
6925 visited,
6926 vectorized_scalar_stmts,
6927 scalar_stmts_in_externs);
6928 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
6929 vectorized_scalar_stmts.add (rstmt);
6931 /* Scalar stmts used as defs in external nodes need to be preserved, so
6932 remove them from vectorized_scalar_stmts. */
6933 for (stmt_vec_info stmt : scalar_stmts_in_externs)
6934 vectorized_scalar_stmts.remove (stmt);
6936 /* Calculate scalar cost and sum the cost for the vector stmts
6937 previously collected. */
6938 stmt_vector_for_cost scalar_costs = vNULL;
6939 stmt_vector_for_cost vector_costs = vNULL;
6940 visited.empty ();
6941 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6943 auto_vec<bool, 20> life;
6944 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
6945 true);
6946 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6947 record_stmt_cost (&scalar_costs,
6948 SLP_INSTANCE_ROOT_STMTS (instance).length (),
6949 scalar_stmt,
6950 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
6951 vect_bb_slp_scalar_cost (bb_vinfo,
6952 SLP_INSTANCE_TREE (instance),
6953 &life, &scalar_costs, vectorized_scalar_stmts,
6954 visited);
6955 vector_costs.safe_splice (instance->cost_vec);
6956 instance->cost_vec.release ();
6959 if (dump_enabled_p ())
6960 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
6962 /* When costing non-loop vectorization we need to consider each covered
6963 loop independently and make sure vectorization is profitable. For
6964    now we assume a loop may not be entered or may execute an arbitrary
6965 number of iterations (??? static information can provide more
6966 precise info here) which means we can simply cost each containing
6967    loop's stmts separately.  */
6969 /* First produce cost vectors sorted by loop index. */
6970 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6971 li_scalar_costs (scalar_costs.length ());
6972 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6973 li_vector_costs (vector_costs.length ());
6974 stmt_info_for_cost *cost;
6975 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
6977 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6978 li_scalar_costs.quick_push (std::make_pair (l, cost));
6980 /* Use an arbitrary covered loop as a fallback in case the first vector_costs
6981 entry does not have a stmt_info associated with it. */
6982 unsigned l = li_scalar_costs[0].first;
6983 FOR_EACH_VEC_ELT (vector_costs, i, cost)
6985 /* We inherit from the previous COST, invariants, externals and
6986 extracts immediately follow the cost for the related stmt. */
6987 if (cost->stmt_info)
6988 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6989 li_vector_costs.quick_push (std::make_pair (l, cost));
6991 li_scalar_costs.qsort (li_cost_vec_cmp);
6992 li_vector_costs.qsort (li_cost_vec_cmp);
6994 /* Now cost the portions individually. */
6995 unsigned vi = 0;
6996 unsigned si = 0;
6997 bool profitable = true;
6998 while (si < li_scalar_costs.length ()
6999 && vi < li_vector_costs.length ())
7001 unsigned sl = li_scalar_costs[si].first;
7002 unsigned vl = li_vector_costs[vi].first;
7003 if (sl != vl)
7005 if (dump_enabled_p ())
7006 dump_printf_loc (MSG_NOTE, vect_location,
7007 "Scalar %d and vector %d loop part do not "
7008 "match up, skipping scalar part\n", sl, vl);
7009 /* Skip the scalar part, assuming zero cost on the vector side. */
7012 si++;
7014 while (si < li_scalar_costs.length ()
7015 && li_scalar_costs[si].first == sl);
7016 continue;
7019 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
7022 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
7023 si++;
7025 while (si < li_scalar_costs.length ()
7026 && li_scalar_costs[si].first == sl);
7027 unsigned dummy;
7028 finish_cost (scalar_target_cost_data, nullptr,
7029 &dummy, &scalar_cost, &dummy);
7031 /* Complete the target-specific vector cost calculation. */
7032 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
7035 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
7036 vi++;
7038 while (vi < li_vector_costs.length ()
7039 && li_vector_costs[vi].first == vl);
7040 finish_cost (vect_target_cost_data, scalar_target_cost_data,
7041 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7042 delete scalar_target_cost_data;
7043 delete vect_target_cost_data;
7045 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7047 if (dump_enabled_p ())
7049 dump_printf_loc (MSG_NOTE, vect_location,
7050 "Cost model analysis for part in loop %d:\n", sl);
7051 dump_printf (MSG_NOTE, " Vector cost: %d\n",
7052 vec_inside_cost + vec_outside_cost);
7053 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
7056 /* Vectorization is profitable if its cost is no more than the cost of
7057 the scalar version. Note that we err on the vector side for equal cost
7058 because the cost estimate is otherwise quite pessimistic (constant uses
7059 are free on the scalar side but cost a load on the vector side, for
7060 example). */
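/* With purely illustrative numbers: vec_inside_cost == 4, vec_outside_cost
   == 2 and scalar_cost == 6 is still considered profitable (6 <= 6), while
   scalar_cost == 5 is not (6 > 5). */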
7061 if (vec_outside_cost + vec_inside_cost > scalar_cost)
7063 profitable = false;
7064 break;
7067 if (profitable && vi < li_vector_costs.length ())
7069 if (dump_enabled_p ())
7070 dump_printf_loc (MSG_NOTE, vect_location,
7071 "Excess vector cost for part in loop %d:\n",
7072 li_vector_costs[vi].first);
7073 profitable = false;
7076 /* Unset visited flag. This is delayed when the subgraph is profitable
7077 and we process the loop for remaining unvectorized if-converted code. */
7078 if (!orig_loop || !profitable)
7079 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7080 gimple_set_visited (cost->stmt_info->stmt, false);
7082 scalar_costs.release ();
7083 vector_costs.release ();
7085 return profitable;
7088 /* qsort comparator for lane defs. */
7090 static int
7091 vld_cmp (const void *a_, const void *b_)
7093 auto *a = (const std::pair<unsigned, tree> *)a_;
7094 auto *b = (const std::pair<unsigned, tree> *)b_;
7095 return a->first - b->first;
7098 /* Return true if USE_STMT is a vector lane insert into VEC and set
7099 *THIS_LANE to the lane number that is set. */
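/* For example (illustrative GIMPLE), with 32-bit vector elements
     vec_2 = BIT_INSERT_EXPR <vec_1, x_3, 64>;
   inserts x_3 at bit position 64, i.e. lane 64 / 32 == 2, so *THIS_LANE
   is set to 2. */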
7101 static bool
7102 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7104 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
7105 if (!use_ass
7106 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7107 || (vec
7108 ? gimple_assign_rhs1 (use_ass) != vec
7109 : ((vec = gimple_assign_rhs1 (use_ass)), false))
7110 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7111 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7112 || !constant_multiple_p
7113 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7114 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7115 this_lane))
7116 return false;
7117 return true;
7120 /* Find any vectorizable constructors, lane-insert chains and reduction
7121 chains in the region and record them as SLP roots in BB_VINFO. */
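/* An illustrative sketch (made-up SSA names) of the roots recognized below:

     v_7 = {a_5, b_6};                        <- CONSTRUCTOR root

     w_8 = BIT_INSERT_EXPR <w_1, a_5, 0>;
     w_9 = BIT_INSERT_EXPR <w_8, b_6, 32>;    <- lane-insert chain root

   plus association chains like a_1 + b_2 + c_3 + d_4 whose chain end is
   recorded as an slp_inst_kind_bb_reduc root. */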
7123 static void
7124 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7126 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7127 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7128 !gsi_end_p (gsi); gsi_next (&gsi))
7130 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7131 if (!assign)
7132 continue;
7134 tree rhs = gimple_assign_rhs1 (assign);
7135 enum tree_code code = gimple_assign_rhs_code (assign);
7136 use_operand_p use_p;
7137 gimple *use_stmt;
7138 if (code == CONSTRUCTOR)
7140 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7141 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7142 CONSTRUCTOR_NELTS (rhs))
7143 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7144 || uniform_vector_p (rhs))
7145 continue;
7147 unsigned j;
7148 tree val;
7149 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7150 if (TREE_CODE (val) != SSA_NAME
7151 || !bb_vinfo->lookup_def (val))
7152 break;
7153 if (j != CONSTRUCTOR_NELTS (rhs))
7154 continue;
7156 vec<stmt_vec_info> roots = vNULL;
7157 roots.safe_push (bb_vinfo->lookup_stmt (assign));
7158 vec<stmt_vec_info> stmts;
7159 stmts.create (CONSTRUCTOR_NELTS (rhs));
7160 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7161 stmts.quick_push
7162 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
7163 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7164 stmts, roots));
7166 else if (code == BIT_INSERT_EXPR
7167 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7168 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7169 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7170 && integer_zerop (gimple_assign_rhs3 (assign))
7171 && useless_type_conversion_p
7172 (TREE_TYPE (TREE_TYPE (rhs)),
7173 TREE_TYPE (gimple_assign_rhs2 (assign)))
7174 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7176 /* We start to match on the insert to lane zero, but since the
7177 inserts need not be ordered we'd have to search both
7178 the def and the use chains. */
7179 tree vectype = TREE_TYPE (rhs);
7180 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7181 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7182 auto_sbitmap lanes (nlanes);
7183 bitmap_clear (lanes);
7184 bitmap_set_bit (lanes, 0);
7185 tree def = gimple_assign_lhs (assign);
7186 lane_defs.quick_push
7187 (std::make_pair (0, gimple_assign_rhs2 (assign)));
7188 unsigned lanes_found = 1;
7189 /* Start with the use chains; the last stmt will be the root. */
7190 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7191 vec<stmt_vec_info> roots = vNULL;
7192 roots.safe_push (last);
7195 use_operand_p use_p;
7196 gimple *use_stmt;
7197 if (!single_imm_use (def, &use_p, &use_stmt))
7198 break;
7199 unsigned this_lane;
7200 if (!bb_vinfo->lookup_stmt (use_stmt)
7201 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7202 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7203 break;
7204 if (bitmap_bit_p (lanes, this_lane))
7205 break;
7206 lanes_found++;
7207 bitmap_set_bit (lanes, this_lane);
7208 gassign *use_ass = as_a <gassign *> (use_stmt);
7209 lane_defs.quick_push (std::make_pair
7210 (this_lane, gimple_assign_rhs2 (use_ass)));
7211 last = bb_vinfo->lookup_stmt (use_ass);
7212 roots.safe_push (last);
7213 def = gimple_assign_lhs (use_ass);
7215 while (lanes_found < nlanes);
7216 if (roots.length () > 1)
7217 std::swap(roots[0], roots[roots.length () - 1]);
7218 if (lanes_found < nlanes)
7220 /* Now search the def chain. */
7221 def = gimple_assign_rhs1 (assign);
7224 if (TREE_CODE (def) != SSA_NAME
7225 || !has_single_use (def))
7226 break;
7227 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7228 unsigned this_lane;
7229 if (!bb_vinfo->lookup_stmt (def_stmt)
7230 || !vect_slp_is_lane_insert (def_stmt,
7231 NULL_TREE, &this_lane)
7232 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7233 break;
7234 if (bitmap_bit_p (lanes, this_lane))
7235 break;
7236 lanes_found++;
7237 bitmap_set_bit (lanes, this_lane);
7238 lane_defs.quick_push (std::make_pair
7239 (this_lane,
7240 gimple_assign_rhs2 (def_stmt)));
7241 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7242 def = gimple_assign_rhs1 (def_stmt);
7244 while (lanes_found < nlanes);
7246 if (lanes_found == nlanes)
7248 /* Sort lane_defs by the lane index and register the root. */
7249 lane_defs.qsort (vld_cmp);
7250 vec<stmt_vec_info> stmts;
7251 stmts.create (nlanes);
7252 for (unsigned i = 0; i < nlanes; ++i)
7253 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7254 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7255 stmts, roots));
7257 else
7258 roots.release ();
7260 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7261 && (associative_tree_code (code) || code == MINUS_EXPR)
7262 /* ??? This pessimizes a two-element reduction. PR54400.
7263 ??? In-order reduction could be handled if we only
7264 traverse one operand chain in vect_slp_linearize_chain. */
7265 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7266 /* Ops with constants at the tail can be stripped here. */
7267 && TREE_CODE (rhs) == SSA_NAME
7268 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7269 /* Should be the chain end. */
7270 && (!single_imm_use (gimple_assign_lhs (assign),
7271 &use_p, &use_stmt)
7272 || !is_gimple_assign (use_stmt)
7273 || (gimple_assign_rhs_code (use_stmt) != code
7274 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7275 || (gimple_assign_rhs_code (use_stmt)
7276 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7278 /* We start the match at the end of a possible association
7279 chain. */
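/* A minimal sketch with made-up SSA names:
     _5 = a_1 + b_2;
     _6 = _5 + c_3;
     _7 = _6 + d_4;
   The match starts at _7 (the chain end); vect_slp_linearize_chain collects
   the leaf operands a_1, b_2, c_3 and d_4 and the chain stmts become the
   roots of an slp_inst_kind_bb_reduc instance. */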
7280 auto_vec<chain_op_t> chain;
7281 auto_vec<std::pair<tree_code, gimple *> > worklist;
7282 auto_vec<gimple *> chain_stmts;
7283 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7284 if (code == MINUS_EXPR)
7285 code = PLUS_EXPR;
7286 internal_fn reduc_fn;
7287 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7288 || reduc_fn == IFN_LAST)
7289 continue;
7290 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7291 /* ??? */
7292 code_stmt, alt_code_stmt, &chain_stmts);
7293 if (chain.length () > 1)
7295 /* Sort the chain according to def_type and operation. */
7296 chain.sort (dt_sort_cmp, bb_vinfo);
7297 /* ??? Now we'd want to strip externals and constants
7298 but record those to be handled in the epilogue. */
7299 /* ??? For now do not allow mixing ops or externs/constants. */
7300 bool invalid = false;
7301 unsigned remain_cnt = 0;
7302 for (unsigned i = 0; i < chain.length (); ++i)
7304 if (chain[i].code != code)
7306 invalid = true;
7307 break;
7309 if (chain[i].dt != vect_internal_def)
7310 remain_cnt++;
7312 if (!invalid && chain.length () - remain_cnt > 1)
7314 vec<stmt_vec_info> stmts;
7315 vec<tree> remain = vNULL;
7316 stmts.create (chain.length ());
7317 if (remain_cnt > 0)
7318 remain.create (remain_cnt);
7319 for (unsigned i = 0; i < chain.length (); ++i)
7321 if (chain[i].dt == vect_internal_def)
7322 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
7323 else
7324 remain.quick_push (chain[i].op);
7326 vec<stmt_vec_info> roots;
7327 roots.create (chain_stmts.length ());
7328 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7329 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7330 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7331 stmts, roots, remain));
7338 /* Walk the grouped store chains and replace entries with their
7339 pattern variant if any. */
7341 static void
7342 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7344 stmt_vec_info first_element;
7345 unsigned i;
7347 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7349 /* We also have CTORs in this array. */
7350 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7351 continue;
7352 if (STMT_VINFO_IN_PATTERN_P (first_element))
7354 stmt_vec_info orig = first_element;
7355 first_element = STMT_VINFO_RELATED_STMT (first_element);
7356 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7357 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7358 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7359 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7360 vinfo->grouped_stores[i] = first_element;
7362 stmt_vec_info prev = first_element;
7363 while (DR_GROUP_NEXT_ELEMENT (prev))
7365 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7366 if (STMT_VINFO_IN_PATTERN_P (elt))
7368 stmt_vec_info orig = elt;
7369 elt = STMT_VINFO_RELATED_STMT (elt);
7370 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7371 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7372 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7374 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7375 prev = elt;
7380 /* Check if the region described by BB_VINFO can be vectorized, returning
7381 true if so. When returning false, set FATAL to true if the same failure
7382 would prevent vectorization at other vector sizes, false if it is still
7383 worth trying other sizes. N_STMTS is the number of statements in the
7384 region. */
7386 static bool
7387 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7388 vec<int> *dataref_groups)
7390 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7392 slp_instance instance;
7393 int i;
7394 poly_uint64 min_vf = 2;
7396 /* The first group of checks is independent of the vector size. */
7397 fatal = true;
7399 /* Analyze the data references. */
7401 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7403 if (dump_enabled_p ())
7404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7405 "not vectorized: unhandled data-ref in basic "
7406 "block.\n");
7407 return false;
7410 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7412 if (dump_enabled_p ())
7413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7414 "not vectorized: unhandled data access in "
7415 "basic block.\n");
7416 return false;
7419 vect_slp_check_for_roots (bb_vinfo);
7421 /* If there are no grouped stores and no constructors in the region
7422 there is no need to continue with pattern recog as vect_analyze_slp
7423 will fail anyway. */
7424 if (bb_vinfo->grouped_stores.is_empty ()
7425 && bb_vinfo->roots.is_empty ())
7427 if (dump_enabled_p ())
7428 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7429 "not vectorized: no grouped stores in "
7430 "basic block.\n");
7431 return false;
7435 /* The rest of the analysis below depends on the vector size in some way, so from here on failures are not fatal for other vector sizes. */
7435 fatal = false;
7437 vect_pattern_recog (bb_vinfo);
7439 /* Update store groups from pattern processing. */
7440 vect_fixup_store_groups_with_patterns (bb_vinfo);
7442 /* Check the SLP opportunities in the basic block, analyze and build SLP
7443 trees. */
7444 if (!vect_analyze_slp (bb_vinfo, n_stmts))
7446 if (dump_enabled_p ())
7448 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7449 "Failed to SLP the basic block.\n");
7450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7451 "not vectorized: failed to find SLP opportunities "
7452 "in basic block.\n");
7454 return false;
7457 /* Optimize permutations. */
7458 vect_optimize_slp (bb_vinfo);
7460 /* Gather the loads reachable from the SLP graph entries. */
7461 vect_gather_slp_loads (bb_vinfo);
7463 vect_record_base_alignments (bb_vinfo);
7465 /* Analyze and verify the alignment of data references and the
7466 dependence in the SLP instances. */
7467 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7469 vect_location = instance->location ();
7470 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7471 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7473 slp_tree node = SLP_INSTANCE_TREE (instance);
7474 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7475 if (dump_enabled_p ())
7476 dump_printf_loc (MSG_NOTE, vect_location,
7477 "removing SLP instance operations starting from: %G",
7478 stmt_info->stmt);
7479 vect_free_slp_instance (instance);
7480 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7481 continue;
7484 /* Mark all the statements that we want to vectorize as pure SLP and
7485 relevant. */
7486 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7487 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7488 unsigned j;
7489 stmt_vec_info root;
7490 /* Likewise consider instance root stmts as vectorized. */
7491 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7492 STMT_SLP_TYPE (root) = pure_slp;
7494 i++;
7496 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7497 return false;
7499 if (!vect_slp_analyze_operations (bb_vinfo))
7501 if (dump_enabled_p ())
7502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7503 "not vectorized: bad operation in basic block.\n");
7504 return false;
7507 vect_bb_partition_graph (bb_vinfo);
7509 return true;
7512 /* Subroutine of vect_slp_bbs. Try to vectorize the statements for all
7513 basic blocks in BBS, returning true on success.
7514 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
7516 static bool
7517 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7518 vec<int> *dataref_groups, unsigned int n_stmts,
7519 loop_p orig_loop)
7521 bb_vec_info bb_vinfo;
7522 auto_vector_modes vector_modes;
7524 /* Autodetect first vector size we try. */
7525 machine_mode next_vector_mode = VOIDmode;
7526 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7527 unsigned int mode_i = 0;
7529 vec_info_shared shared;
7531 machine_mode autodetected_vector_mode = VOIDmode;
7532 while (1)
7534 bool vectorized = false;
7535 bool fatal = false;
7536 bb_vinfo = new _bb_vec_info (bbs, &shared);
7538 bool first_time_p = shared.datarefs.is_empty ();
7539 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7540 if (first_time_p)
7541 bb_vinfo->shared->save_datarefs ();
7542 else
7543 bb_vinfo->shared->check_datarefs ();
7544 bb_vinfo->vector_mode = next_vector_mode;
7546 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7548 if (dump_enabled_p ())
7550 dump_printf_loc (MSG_NOTE, vect_location,
7551 "***** Analysis succeeded with vector mode"
7552 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7553 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7556 bb_vinfo->shared->check_datarefs ();
7558 auto_vec<slp_instance> profitable_subgraphs;
7559 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7561 if (instance->subgraph_entries.is_empty ())
7562 continue;
7564 dump_user_location_t saved_vect_location = vect_location;
7565 vect_location = instance->location ();
7566 if (!unlimited_cost_model (NULL)
7567 && !vect_bb_vectorization_profitable_p
7568 (bb_vinfo, instance->subgraph_entries, orig_loop))
7570 if (dump_enabled_p ())
7571 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7572 "not vectorized: vectorization is not "
7573 "profitable.\n");
7574 vect_location = saved_vect_location;
7575 continue;
7578 vect_location = saved_vect_location;
7579 if (!dbg_cnt (vect_slp))
7580 continue;
7582 profitable_subgraphs.safe_push (instance);
7585 /* When we're vectorizing an if-converted loop body make sure
7586 we vectorized all if-converted code. */
7587 if (!profitable_subgraphs.is_empty ()
7588 && orig_loop)
7590 gcc_assert (bb_vinfo->bbs.length () == 1);
7591 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7592 !gsi_end_p (gsi); gsi_next (&gsi))
7594 /* The costing above left us with DCEable vectorized scalar
7595 stmts having the visited flag set on profitable
7596 subgraphs. Do the delayed clearing of the flag here. */
7597 if (gimple_visited_p (gsi_stmt (gsi)))
7599 gimple_set_visited (gsi_stmt (gsi), false);
7600 continue;
7602 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7603 continue;
7605 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7606 if (gimple_assign_rhs_code (ass) == COND_EXPR)
7608 if (!profitable_subgraphs.is_empty ()
7609 && dump_enabled_p ())
7610 dump_printf_loc (MSG_NOTE, vect_location,
7611 "not profitable because of "
7612 "unprofitable if-converted scalar "
7613 "code\n");
7614 profitable_subgraphs.truncate (0);
7619 /* Finally schedule the profitable subgraphs. */
7620 for (slp_instance instance : profitable_subgraphs)
7622 if (!vectorized && dump_enabled_p ())
7623 dump_printf_loc (MSG_NOTE, vect_location,
7624 "Basic block will be vectorized "
7625 "using SLP\n");
7626 vectorized = true;
7628 /* Dump before scheduling as store vectorization will remove
7629 the original stores and mess with the instance tree
7630 so querying its location will eventually ICE. */
7631 if (flag_checking)
7632 for (slp_instance sub : instance->subgraph_entries)
7633 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7634 unsigned HOST_WIDE_INT bytes;
7635 if (dump_enabled_p ())
7636 for (slp_instance sub : instance->subgraph_entries)
7638 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7639 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
7640 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7641 sub->location (),
7642 "basic block part vectorized using %wu "
7643 "byte vectors\n", bytes);
7644 else
7645 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7646 sub->location (),
7647 "basic block part vectorized using "
7648 "variable length vectors\n");
7651 dump_user_location_t saved_vect_location = vect_location;
7652 vect_location = instance->location ();
7654 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7656 vect_location = saved_vect_location;
7659 else
7661 if (dump_enabled_p ())
7662 dump_printf_loc (MSG_NOTE, vect_location,
7663 "***** Analysis failed with vector mode %s\n",
7664 GET_MODE_NAME (bb_vinfo->vector_mode));
7667 if (mode_i == 0)
7668 autodetected_vector_mode = bb_vinfo->vector_mode;
7670 if (!fatal)
7671 while (mode_i < vector_modes.length ()
7672 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7674 if (dump_enabled_p ())
7675 dump_printf_loc (MSG_NOTE, vect_location,
7676 "***** The result for vector mode %s would"
7677 " be the same\n",
7678 GET_MODE_NAME (vector_modes[mode_i]));
7679 mode_i += 1;
7682 delete bb_vinfo;
7684 if (mode_i < vector_modes.length ()
7685 && VECTOR_MODE_P (autodetected_vector_mode)
7686 && (related_vector_mode (vector_modes[mode_i],
7687 GET_MODE_INNER (autodetected_vector_mode))
7688 == autodetected_vector_mode)
7689 && (related_vector_mode (autodetected_vector_mode,
7690 GET_MODE_INNER (vector_modes[mode_i]))
7691 == vector_modes[mode_i]))
7693 if (dump_enabled_p ())
7694 dump_printf_loc (MSG_NOTE, vect_location,
7695 "***** Skipping vector mode %s, which would"
7696 " repeat the analysis for %s\n",
7697 GET_MODE_NAME (vector_modes[mode_i]),
7698 GET_MODE_NAME (autodetected_vector_mode));
7699 mode_i += 1;
7702 if (vectorized
7703 || mode_i == vector_modes.length ()
7704 || autodetected_vector_mode == VOIDmode
7705 /* If vect_slp_analyze_bb_1 signaled that analysis for all
7706 vector sizes will fail do not bother iterating. */
7707 || fatal)
7708 return vectorized;
7710 /* Try the next biggest vector size. */
7711 next_vector_mode = vector_modes[mode_i++];
7712 if (dump_enabled_p ())
7713 dump_printf_loc (MSG_NOTE, vect_location,
7714 "***** Re-trying analysis with vector mode %s\n",
7715 GET_MODE_NAME (next_vector_mode));
7720 /* Main entry for the BB vectorizer. Analyze and transform BBS, returning
7721 true if anything in the basic blocks was vectorized. */
7723 static bool
7724 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7726 vec<data_reference_p> datarefs = vNULL;
7727 auto_vec<int> dataref_groups;
7728 int insns = 0;
7729 int current_group = 0;
7731 for (unsigned i = 0; i < bbs.length (); i++)
7733 basic_block bb = bbs[i];
7734 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
7735 gsi_next (&gsi))
7737 gimple *stmt = gsi_stmt (gsi);
7738 if (is_gimple_debug (stmt))
7739 continue;
7741 insns++;
7743 if (gimple_location (stmt) != UNKNOWN_LOCATION)
7744 vect_location = stmt;
7746 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
7747 &dataref_groups, current_group))
7748 ++current_group;
7750 /* New BBs always start a new DR group. */
7751 ++current_group;
7754 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
7757 /* Special entry for the BB vectorizer. Analyze and transform a single
7758 if-converted BB, with ORIG_LOOP's body being the not-if-converted
7759 representation. Returns true if anything in the basic block was
7760 vectorized. */
7762 bool
7763 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
7765 auto_vec<basic_block> bbs;
7766 bbs.safe_push (bb);
7767 return vect_slp_bbs (bbs, orig_loop);
7770 /* Main entry for the BB vectorizer. Analyze and transform the basic
7771 blocks of FUN, returning true if anything was vectorized. */
7773 bool
7774 vect_slp_function (function *fun)
7776 bool r = false;
7777 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
7778 auto_bitmap exit_bbs;
7779 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
7780 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
7781 unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
7782 true, rpo, NULL);
7784 /* For the moment split the function into pieces to avoid making
7785 the iteration on the vector mode moot. Split at points we know
7786 we do not handle well, which are CFG merges (SLP discovery doesn't
7787 handle non-loop-header PHIs) and loop exits. Since pattern
7788 recog requires reverse iteration to visit uses before defs,
7789 simply chop the RPO into pieces. */
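/* Illustrative example: with an RPO of bb2 bb3 bb4 where bb4 is a CFG merge
   not dominated by bb2, the accumulated region { bb2, bb3 } is handed to
   vect_slp_bbs on its own and a new region is started at bb4. */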
7790 auto_vec<basic_block> bbs;
7791 for (unsigned i = 0; i < n; i++)
7793 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
7794 bool split = false;
7796 /* Split when a BB is not dominated by the first block. */
7797 if (!bbs.is_empty ()
7798 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
7800 if (dump_enabled_p ())
7801 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7802 "splitting region at dominance boundary bb%d\n",
7803 bb->index);
7804 split = true;
7806 /* Split when the loop determined by the first block
7807 is exited. This is because we eventually insert
7808 invariants at region begin. */
7809 else if (!bbs.is_empty ()
7810 && bbs[0]->loop_father != bb->loop_father
7811 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
7813 if (dump_enabled_p ())
7814 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7815 "splitting region at loop %d exit at bb%d\n",
7816 bbs[0]->loop_father->num, bb->index);
7817 split = true;
7819 else if (!bbs.is_empty ()
7820 && bb->loop_father->header == bb
7821 && bb->loop_father->dont_vectorize)
7823 if (dump_enabled_p ())
7824 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7825 "splitting region at dont-vectorize loop %d "
7826 "entry at bb%d\n",
7827 bb->loop_father->num, bb->index);
7828 split = true;
7831 if (split && !bbs.is_empty ())
7833 r |= vect_slp_bbs (bbs, NULL);
7834 bbs.truncate (0);
7837 if (bbs.is_empty ())
7839 /* We need to be able to insert at the head of the region, which
7840 we cannot do for a region starting with a returns-twice call. */
7841 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
7842 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
7844 if (dump_enabled_p ())
7845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7846 "skipping bb%d as start of region as it "
7847 "starts with returns-twice call\n",
7848 bb->index);
7849 continue;
7851 /* If the loop this BB belongs to is marked as not to be vectorized
7852 honor that also for BB vectorization. */
7853 if (bb->loop_father->dont_vectorize)
7854 continue;
7857 bbs.safe_push (bb);
7859 /* When a stmt ends this block and defines a value, inserting a
7860 vector containing its definition after it would require inserting
7861 on edges. Avoid this for now. */
7862 if (gimple *last = *gsi_last_bb (bb))
7863 if (gimple_get_lhs (last)
7864 && is_ctrl_altering_stmt (last))
7866 if (dump_enabled_p ())
7867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7868 "splitting region at control altering "
7869 "definition %G", last);
7870 r |= vect_slp_bbs (bbs, NULL);
7871 bbs.truncate (0);
7875 if (!bbs.is_empty ())
7876 r |= vect_slp_bbs (bbs, NULL);
7878 free (rpo);
7880 return r;
7883 /* Build a variable-length vector in which the elements in ELTS are repeated
7884 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
7885 RESULTS and add any new instructions to SEQ.
7887 The approach we use is:
7889 (1) Find a vector mode VM with integer elements of mode IM.
7891 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7892 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
7893 from small vectors to IM.
7895 (3) Duplicate each ELTS'[I] into a vector of mode VM.
7897 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
7898 correct byte contents.
7900 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
7902 We try to find the largest IM for which this sequence works, in order
7903 to cut down on the number of interleaves. */
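/* A worked example with made-up modes: for ELTS = { a, b, c, d } of SImode
   and NRESULTS == 1 the search may pick IM = DImode and a DImode vector
   mode VM, giving NVECTORS == 2. Then { a, b } and { c, d } are each
   VIEW_CONVERT_EXPRed to DImode (step 2), each value is duplicated across
   a VM vector (step 3), a single interleaving VEC_PERM_EXPR recreates the
   byte pattern abcd abcd ... (step 4) and a final VIEW_CONVERT_EXPR yields
   the requested VECTOR_TYPE (step 5). */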
7905 void
7906 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
7907 const vec<tree> &elts, unsigned int nresults,
7908 vec<tree> &results)
7910 unsigned int nelts = elts.length ();
7911 tree element_type = TREE_TYPE (vector_type);
7913 /* (1) Find a vector mode VM with integer elements of mode IM. */
7914 unsigned int nvectors = 1;
7915 tree new_vector_type;
7916 tree permutes[2];
7917 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
7918 &nvectors, &new_vector_type,
7919 permutes))
7920 gcc_unreachable ();
7922 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
7923 unsigned int partial_nelts = nelts / nvectors;
7924 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
7926 tree_vector_builder partial_elts;
7927 auto_vec<tree, 32> pieces (nvectors * 2);
7928 pieces.quick_grow_cleared (nvectors * 2);
7929 for (unsigned int i = 0; i < nvectors; ++i)
7931 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7932 ELTS' has mode IM. */
7933 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
7934 for (unsigned int j = 0; j < partial_nelts; ++j)
7935 partial_elts.quick_push (elts[i * partial_nelts + j]);
7936 tree t = gimple_build_vector (seq, &partial_elts);
7937 t = gimple_build (seq, VIEW_CONVERT_EXPR,
7938 TREE_TYPE (new_vector_type), t);
7940 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
7941 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
7944 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
7945 correct byte contents.
7947 Conceptually, we need to repeat the following operation log2(nvectors)
7948 times, where hi_start = nvectors / 2:
7950 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
7951 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
7953 However, if each input repeats every N elements and the VF is
7954 a multiple of N * 2, the HI result is the same as the LO result.
7955 This will be true for the first N1 iterations of the outer loop,
7956 followed by N2 iterations for which both the LO and HI results
7957 are needed. I.e.:
7959 N1 + N2 = log2(nvectors)
7961 Each "N1 iteration" doubles the number of redundant vectors and the
7962 effect of the process as a whole is to have a sequence of nvectors/2**N1
7963 vectors that repeats 2**N1 times. Rather than generate these redundant
7964 vectors, we halve the number of vectors for each N1 iteration. */
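/* Illustrative numbers: with NVECTORS == 4 there are log2(4) == 2 outer
   iterations. If the first one is an "N1 iteration" (HI equals LO), only
   the LO halves are kept and NEW_NVECTORS drops from 4 to 2; the remaining
   "N2 iteration" produces both LO and HI, leaving 2 distinct vectors that
   repeat twice, as described above. */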
7965 unsigned int in_start = 0;
7966 unsigned int out_start = nvectors;
7967 unsigned int new_nvectors = nvectors;
7968 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
7970 unsigned int hi_start = new_nvectors / 2;
7971 unsigned int out_i = 0;
7972 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
7974 if ((in_i & 1) != 0
7975 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
7976 2 * in_repeat))
7977 continue;
7979 tree output = make_ssa_name (new_vector_type);
7980 tree input1 = pieces[in_start + (in_i / 2)];
7981 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
7982 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
7983 input1, input2,
7984 permutes[in_i & 1]);
7985 gimple_seq_add_stmt (seq, stmt);
7986 pieces[out_start + out_i] = output;
7987 out_i += 1;
7989 std::swap (in_start, out_start);
7990 new_nvectors = out_i;
7993 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
7994 results.reserve (nresults);
7995 for (unsigned int i = 0; i < nresults; ++i)
7996 if (i < new_nvectors)
7997 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
7998 pieces[in_start + i]));
7999 else
8000 results.quick_push (results[i - new_nvectors]);
8004 /* For constant and loop invariant defs in OP_NODE this function creates
8005 vector defs that will be used in the vectorized stmts and stores them
8006 to SLP_TREE_VEC_DEFS of OP_NODE. */
8008 static void
8009 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
8011 unsigned HOST_WIDE_INT nunits;
8012 tree vec_cst;
8013 unsigned j, number_of_places_left_in_vector;
8014 tree vector_type;
8015 tree vop;
8016 int group_size = op_node->ops.length ();
8017 unsigned int vec_num, i;
8018 unsigned number_of_copies = 1;
8019 bool constant_p;
8020 gimple_seq ctor_seq = NULL;
8021 auto_vec<tree, 16> permute_results;
8023 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
8024 vector_type = SLP_TREE_VECTYPE (op_node);
8026 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
8027 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
8028 auto_vec<tree> voprnds (number_of_vectors);
8030 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
8031 created vectors. It is greater than 1 if unrolling is performed.
8033 For example, we have two scalar operands, s1 and s2 (e.g., group of
8034 strided accesses of size two), while NUNITS is four (i.e., four scalars
8035 of this type can be packed in a vector). The output vector will contain
8036 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
8037 will be 2).
8039 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
8040 containing the operands.
8042 For example, NUNITS is four as before, and the group size is 8
8043 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
8044 {s5, s6, s7, s8}. */
8046 /* When using duplicate_and_interleave, we just need one element for
8047 each scalar statement. */
8048 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
8049 nunits = group_size;
8051 number_of_copies = nunits * number_of_vectors / group_size;
8053 number_of_places_left_in_vector = nunits;
8054 constant_p = true;
8055 tree_vector_builder elts (vector_type, nunits, 1);
8056 elts.quick_grow (nunits);
8057 stmt_vec_info insert_after = NULL;
8058 for (j = 0; j < number_of_copies; j++)
8060 tree op;
8061 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
8063 /* Create 'vect_ = {op0,op1,...,opn}'. */
8064 number_of_places_left_in_vector--;
8065 tree orig_op = op;
8066 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8068 if (CONSTANT_CLASS_P (op))
8070 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8072 /* Can't use VIEW_CONVERT_EXPR for booleans because
8073 of possibly different sizes of scalar value and
8074 vector element. */
8075 if (integer_zerop (op))
8076 op = build_int_cst (TREE_TYPE (vector_type), 0);
8077 else if (integer_onep (op))
8078 op = build_all_ones_cst (TREE_TYPE (vector_type));
8079 else
8080 gcc_unreachable ();
8082 else
8083 op = fold_unary (VIEW_CONVERT_EXPR,
8084 TREE_TYPE (vector_type), op);
8085 gcc_assert (op && CONSTANT_CLASS_P (op));
8087 else
8089 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8090 gimple *init_stmt;
8091 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8093 tree true_val
8094 = build_all_ones_cst (TREE_TYPE (vector_type));
8095 tree false_val
8096 = build_zero_cst (TREE_TYPE (vector_type));
8097 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8098 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8099 op, true_val,
8100 false_val);
8102 else
8104 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8105 op);
8106 init_stmt
8107 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8108 op);
8110 gimple_seq_add_stmt (&ctor_seq, init_stmt);
8111 op = new_temp;
8114 elts[number_of_places_left_in_vector] = op;
8115 if (!CONSTANT_CLASS_P (op))
8116 constant_p = false;
8117 /* For BB vectorization we have to compute an insert location
8118 when a def is inside the analyzed region since we cannot
8119 simply insert at the BB start in this case. */
8120 stmt_vec_info opdef;
8121 if (TREE_CODE (orig_op) == SSA_NAME
8122 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8123 && is_a <bb_vec_info> (vinfo)
8124 && (opdef = vinfo->lookup_def (orig_op)))
8126 if (!insert_after)
8127 insert_after = opdef;
8128 else
8129 insert_after = get_later_stmt (insert_after, opdef);
8132 if (number_of_places_left_in_vector == 0)
8134 if (constant_p
8135 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
8136 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
8137 vec_cst = gimple_build_vector (&ctor_seq, &elts);
8138 else
8140 if (permute_results.is_empty ())
8141 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
8142 elts, number_of_vectors,
8143 permute_results);
8144 vec_cst = permute_results[number_of_vectors - j - 1];
8146 if (!gimple_seq_empty_p (ctor_seq))
8148 if (insert_after)
8150 gimple_stmt_iterator gsi;
8151 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
8153 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
8154 gsi_insert_seq_before (&gsi, ctor_seq,
8155 GSI_CONTINUE_LINKING);
8157 else if (!stmt_ends_bb_p (insert_after->stmt))
8159 gsi = gsi_for_stmt (insert_after->stmt);
8160 gsi_insert_seq_after (&gsi, ctor_seq,
8161 GSI_CONTINUE_LINKING);
8163 else
8165 /* When we want to insert after a def whose
8166 defining stmt throws, insert on the fallthru
8167 edge instead. */
8168 edge e = find_fallthru_edge
8169 (gimple_bb (insert_after->stmt)->succs);
8170 basic_block new_bb
8171 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8172 gcc_assert (!new_bb);
8175 else
8176 vinfo->insert_seq_on_entry (NULL, ctor_seq);
8177 ctor_seq = NULL;
8179 voprnds.quick_push (vec_cst);
8180 insert_after = NULL;
8181 number_of_places_left_in_vector = nunits;
8182 constant_p = true;
8183 elts.new_vector (vector_type, nunits, 1);
8184 elts.quick_grow (nunits);
8189 /* Since the vectors are created in the reverse order, we reverse them
8190 here to restore the original order. */
8191 vec_num = voprnds.length ();
8192 for (j = vec_num; j != 0; j--)
8194 vop = voprnds[j - 1];
8195 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8198 /* If the VF is greater than the unrolling factor needed for the SLP
8199 group of stmts, the number of vectors to be created (NUMBER_OF_VECTORS)
8200 is greater than NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS,
8201 and hence we have to replicate the vectors created above. */
8202 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8203 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8204 i++)
8205 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8208 /* Get the Ith vectorized definition from SLP_NODE. */
8210 tree
8211 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8213 return SLP_TREE_VEC_DEFS (slp_node)[i];
8216 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8218 void
8219 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8221 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8222 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8225 /* Get N vectorized definitions for SLP_NODE. */
8227 void
8228 vect_get_slp_defs (vec_info *,
8229 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8231 if (n == -1U)
8232 n = SLP_TREE_CHILDREN (slp_node).length ();
8234 for (unsigned i = 0; i < n; ++i)
8236 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8237 vec<tree> vec_defs = vNULL;
8238 vect_get_slp_defs (child, &vec_defs);
8239 vec_oprnds->quick_push (vec_defs);
8243 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8244 - PERM gives the permutation that the caller wants to use for NODE,
8245 which might be different from SLP_LOAD_PERMUTATION.
8246 - DUMP_P controls whether the function dumps information. */
8248 static bool
8249 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8250 load_permutation_t &perm,
8251 const vec<tree> &dr_chain,
8252 gimple_stmt_iterator *gsi, poly_uint64 vf,
8253 bool analyze_only, bool dump_p,
8254 unsigned *n_perms, unsigned int *n_loads,
8255 bool dce_chain)
8257 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8258 int vec_index = 0;
8259 tree vectype = SLP_TREE_VECTYPE (node);
8260 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8261 unsigned int mask_element;
8262 unsigned dr_group_size;
8263 machine_mode mode;
8265 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8266 dr_group_size = 1;
8267 else
8269 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8270 dr_group_size = DR_GROUP_SIZE (stmt_info);
8273 mode = TYPE_MODE (vectype);
8274 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8275 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8277 /* Initialize the vect stmts of NODE to properly insert the generated
8278 stmts later. */
8279 if (! analyze_only)
8280 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8281 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8283 /* Generate permutation masks for every NODE. Number of masks for each NODE
8284 is equal to GROUP_SIZE.
8285 E.g., we have a group of three nodes with three loads from the same
8286 location in each node, and the vector size is 4. I.e., we have an
8287 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8288 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8289 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8292 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8293 The last mask is illegal since we assume two operands for a permute
8294 operation, and the mask element values can't be outside that range.
8295 Hence, the last mask must be converted into {2,5,5,5}.
8296 For the first two permutations we need the first and the second input
8297 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8298 we need the second and the third vectors: {b1,c1,a2,b2} and
8299 {c2,a3,b3,c3}. */
8301 int vect_stmts_counter = 0;
8302 unsigned int index = 0;
8303 int first_vec_index = -1;
8304 int second_vec_index = -1;
8305 bool noop_p = true;
8306 *n_perms = 0;
8308 vec_perm_builder mask;
8309 unsigned int nelts_to_build;
8310 unsigned int nvectors_per_build;
8311 unsigned int in_nlanes;
8312 bool repeating_p = (group_size == dr_group_size
8313 && multiple_p (nunits, group_size));
8314 if (repeating_p)
8316 /* A single vector contains a whole number of copies of the node, so:
8317 (a) all permutes can use the same mask; and
8318 (b) the permutes only need a single vector input. */
8319 mask.new_vector (nunits, group_size, 3);
8320 nelts_to_build = mask.encoded_nelts ();
8321 /* It's possible to obtain zero nstmts during analyze_only, so make
8322 it at least one to ensure the later computation for n_perms
8323 proceeds. */
8324 nvectors_per_build = nstmts > 0 ? nstmts : 1;
8325 in_nlanes = dr_group_size * 3;
8327 else
8329 /* We need to construct a separate mask for each vector statement. */
8330 unsigned HOST_WIDE_INT const_nunits, const_vf;
8331 if (!nunits.is_constant (&const_nunits)
8332 || !vf.is_constant (&const_vf))
8333 return false;
8334 mask.new_vector (const_nunits, const_nunits, 1);
8335 nelts_to_build = const_vf * group_size;
8336 nvectors_per_build = 1;
8337 in_nlanes = const_vf * dr_group_size;
8339 auto_sbitmap used_in_lanes (in_nlanes);
8340 bitmap_clear (used_in_lanes);
8341 auto_bitmap used_defs;
8343 unsigned int count = mask.encoded_nelts ();
8344 mask.quick_grow (count);
8345 vec_perm_indices indices;
8347 for (unsigned int j = 0; j < nelts_to_build; j++)
8349 unsigned int iter_num = j / group_size;
8350 unsigned int stmt_num = j % group_size;
8351 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8352 bitmap_set_bit (used_in_lanes, i);
8353 if (repeating_p)
8355 first_vec_index = 0;
8356 mask_element = i;
8358 else
8360 /* Enforced before the loop when !repeating_p. */
8361 unsigned int const_nunits = nunits.to_constant ();
8362 vec_index = i / const_nunits;
8363 mask_element = i % const_nunits;
8364 if (vec_index == first_vec_index
8365 || first_vec_index == -1)
8367 first_vec_index = vec_index;
8369 else if (vec_index == second_vec_index
8370 || second_vec_index == -1)
8372 second_vec_index = vec_index;
8373 mask_element += const_nunits;
8375 else
8377 if (dump_p)
8378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8379 "permutation requires at "
8380 "least three vectors %G",
8381 stmt_info->stmt);
8382 gcc_assert (analyze_only);
8383 return false;
8386 gcc_assert (mask_element < 2 * const_nunits);
8389 if (mask_element != index)
8390 noop_p = false;
8391 mask[index++] = mask_element;
8393 if (index == count)
8395 if (!noop_p)
8397 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8398 if (!can_vec_perm_const_p (mode, mode, indices))
8400 if (dump_p)
8402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8403 "unsupported vect permute { ");
8404 for (i = 0; i < count; ++i)
8406 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8407 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8409 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8411 gcc_assert (analyze_only);
8412 return false;
8415 tree mask_vec = NULL_TREE;
8416 if (!analyze_only)
8417 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8419 if (second_vec_index == -1)
8420 second_vec_index = first_vec_index;
8422 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8424 ++*n_perms;
8425 if (analyze_only)
8426 continue;
8427 /* Generate the permute statement if necessary. */
8428 tree first_vec = dr_chain[first_vec_index + ri];
8429 tree second_vec = dr_chain[second_vec_index + ri];
8430 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8431 tree perm_dest
8432 = vect_create_destination_var (gimple_assign_lhs (stmt),
8433 vectype);
8434 perm_dest = make_ssa_name (perm_dest);
8435 gimple *perm_stmt
8436 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8437 second_vec, mask_vec);
8438 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8439 gsi);
8440 if (dce_chain)
8442 bitmap_set_bit (used_defs, first_vec_index + ri);
8443 bitmap_set_bit (used_defs, second_vec_index + ri);
8446 /* Store the vector statement in NODE. */
8447 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8450 else if (!analyze_only)
8452 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8454 tree first_vec = dr_chain[first_vec_index + ri];
8455 /* If mask was NULL_TREE generate the requested
8456 identity transform. */
8457 if (dce_chain)
8458 bitmap_set_bit (used_defs, first_vec_index + ri);
8460 /* Store the vector statement in NODE. */
8461 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8465 index = 0;
8466 first_vec_index = -1;
8467 second_vec_index = -1;
8468 noop_p = true;
8472 if (n_loads)
8474 if (repeating_p)
8475 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8476 else
8478 /* Enforced above when !repeating_p. */
8479 unsigned int const_nunits = nunits.to_constant ();
8480 *n_loads = 0;
8481 bool load_seen = false;
8482 for (unsigned i = 0; i < in_nlanes; ++i)
8484 if (i % const_nunits == 0)
8486 if (load_seen)
8487 *n_loads += 1;
8488 load_seen = false;
8490 if (bitmap_bit_p (used_in_lanes, i))
8491 load_seen = true;
8493 if (load_seen)
8494 *n_loads += 1;
8498 if (dce_chain)
8499 for (unsigned i = 0; i < dr_chain.length (); ++i)
8500 if (!bitmap_bit_p (used_defs, i))
8502 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
8503 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8504 gsi_remove (&rgsi, true);
8505 release_defs (stmt);
8508 return true;
8511 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8512 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8513 permute statements for the SLP node NODE. Store the number of vector
8514 permute instructions in *N_PERMS and the number of vector load
8515 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8516 that were not needed. */
8518 bool
8519 vect_transform_slp_perm_load (vec_info *vinfo,
8520 slp_tree node, const vec<tree> &dr_chain,
8521 gimple_stmt_iterator *gsi, poly_uint64 vf,
8522 bool analyze_only, unsigned *n_perms,
8523 unsigned int *n_loads, bool dce_chain)
8525 return vect_transform_slp_perm_load_1 (vinfo, node,
8526 SLP_TREE_LOAD_PERMUTATION (node),
8527 dr_chain, gsi, vf, analyze_only,
8528 dump_enabled_p (), n_perms, n_loads,
8529 dce_chain);
8532 /* Produce the next vector result for SLP permutation NODE by adding a vector
8533 statement at GSI. If MASK_VEC is nonnull, add:
8535 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8537 otherwise add:
8539 <new SSA name> = FIRST_DEF. */
8541 static void
8542 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8543 slp_tree node, tree first_def, tree second_def,
8544 tree mask_vec, poly_uint64 identity_offset)
8546 tree vectype = SLP_TREE_VECTYPE (node);
8548 /* ??? We SLP match existing vector element extracts but
8549 allow punning, which we need to re-instantiate at uses
8550 but have no good way of representing explicitly. */
8551 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8552 && !types_compatible_p (TREE_TYPE (first_def), vectype))
8554 gassign *conv_stmt
8555 = gimple_build_assign (make_ssa_name (vectype),
8556 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8557 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8558 first_def = gimple_assign_lhs (conv_stmt);
8560 gassign *perm_stmt;
8561 tree perm_dest = make_ssa_name (vectype);
8562 if (mask_vec)
8564 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8565 TYPE_SIZE (vectype))
8566 && !types_compatible_p (TREE_TYPE (second_def), vectype))
8568 gassign *conv_stmt
8569 = gimple_build_assign (make_ssa_name (vectype),
8570 build1 (VIEW_CONVERT_EXPR,
8571 vectype, second_def));
8572 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8573 second_def = gimple_assign_lhs (conv_stmt);
8575 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8576 first_def, second_def,
8577 mask_vec);
8579 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8581 /* For identity permutes we still need to handle the case
8582 of offsetted extracts or concats. */
8583 unsigned HOST_WIDE_INT c;
8584 auto first_def_nunits
8585 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8586 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8588 unsigned HOST_WIDE_INT elsz
8589 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8590 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8591 TYPE_SIZE (vectype),
8592 bitsize_int (identity_offset * elsz));
8593 perm_stmt = gimple_build_assign (perm_dest, lowpart);
8595 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8596 first_def_nunits, &c) && c == 2)
8598 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8599 NULL_TREE, second_def);
8600 perm_stmt = gimple_build_assign (perm_dest, ctor);
8602 else
8603 gcc_unreachable ();
8605 else
8607 /* We need a copy here in case the def was external. */
8608 perm_stmt = gimple_build_assign (perm_dest, first_def);
8610 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8611 /* Store the vector statement in NODE. */
8612 node->push_vec_def (perm_stmt);
8615 /* Subroutine of vectorizable_slp_permutation. Check whether the target
8616 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8617 If GSI is nonnull, emit the permutation there.
8619 When GSI is null, the only purpose of NODE is to give properties
8620 of the result, such as the vector type and number of SLP lanes.
8621 The node does not need to be a VEC_PERM_EXPR.
8623 If the target supports the operation, return the number of individual
8624 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8625 dump file if DUMP_P is true. */
8627 static int
8628 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8629 slp_tree node, lane_permutation_t &perm,
8630 vec<slp_tree> &children, bool dump_p)
8632 tree vectype = SLP_TREE_VECTYPE (node);
8634 /* ??? We currently only support inputs that all have the same vector
8635 type, while the SLP IL should really do a concat + select and thus
8636 accept arbitrary mismatches. */
8637 slp_tree child;
8638 unsigned i;
8639 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8640 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8641 tree op_vectype = NULL_TREE;
8642 FOR_EACH_VEC_ELT (children, i, child)
8643 if (SLP_TREE_VECTYPE (child))
8645 op_vectype = SLP_TREE_VECTYPE (child);
8646 break;
8648 if (!op_vectype)
8649 op_vectype = vectype;
8650 FOR_EACH_VEC_ELT (children, i, child)
8652 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8653 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8654 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8655 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8657 if (dump_p)
8658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8659 "Unsupported vector types in lane permutation\n");
8660 return -1;
8662 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8663 repeating_p = false;
8666 gcc_assert (perm.length () == SLP_TREE_LANES (node));
8667 if (dump_p)
8669 dump_printf_loc (MSG_NOTE, vect_location,
8670 "vectorizing permutation");
8671 for (unsigned i = 0; i < perm.length (); ++i)
8672 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8673 if (repeating_p)
8674 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8675 dump_printf (MSG_NOTE, "\n");
8678 /* REPEATING_P is true if every output vector is guaranteed to use the
8679 same permute vector. We can handle that case for both variable-length
8680 and constant-length vectors, but we only handle other cases for
8681 constant-length vectors.
8683 Set:
8685 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8686 mask vector that we want to build.
8688 - NCOPIES to the number of copies of PERM that we need in order
8689 to build the necessary permute mask vectors.
8691 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8692 for each permute mask vector. This is only relevant when GSI is
8693 nonnull. */
8694 uint64_t npatterns;
8695 unsigned nelts_per_pattern;
8696 uint64_t ncopies;
8697 unsigned noutputs_per_mask;
8698 if (repeating_p)
8700 /* We need a single permute mask vector that has the form:
8702 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8704 In other words, the original n-element permute in PERM is
8705 "unrolled" to fill a full vector. The stepped vector encoding
8706 that we use for permutes requires 3n elements. */
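/* Illustrative example: for a two-lane node with PERM = { op0[1], op0[0] }
   the mask is encoded with NPATTERNS == 2 and NELTS_PER_PATTERN == 3 as
   { 1, 0, 3, 2, 5, 4, ... }, i.e. the lane swap "unrolled" across the
   whole vector. */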
8707 npatterns = SLP_TREE_LANES (node);
8708 nelts_per_pattern = ncopies = 3;
8709 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8711 else
8713 /* Calculate every element of every permute mask vector explicitly,
8714 instead of relying on the pattern described above. */
8715 if (!nunits.is_constant (&npatterns))
8716 return -1;
8717 nelts_per_pattern = ncopies = 1;
8718 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
8719 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
8720 return -1;
8721 noutputs_per_mask = 1;
8723 unsigned olanes = ncopies * SLP_TREE_LANES (node);
8724 gcc_assert (repeating_p || multiple_p (olanes, nunits));
8726 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
8727 from the { SLP operand, scalar lane } permutation as recorded in the
8728 SLP node as an intermediate step. This part should already work
8729 with SLP children that have an arbitrary number of lanes. */
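/* A worked example under assumed conditions (a repeating 2-lane blend
   over two 2-lane children, so ncopies == 3): PERM == { op0[0], op1[1] }
   expands to {{0,0},0} {{1,0},1} {{0,0},2} {{1,0},3} {{0,0},4} {{1,0},5};
   the vector index stays 0 and only the lane advances, matching the 3n
   stepped mask encoding described above.  */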
8730 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
8731 auto_vec<unsigned> active_lane;
8732 vperm.create (olanes);
8733 active_lane.safe_grow_cleared (children.length (), true);
8734 for (unsigned i = 0; i < ncopies; ++i)
8736 for (unsigned pi = 0; pi < perm.length (); ++pi)
8738 std::pair<unsigned, unsigned> p = perm[pi];
8739 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
8740 if (repeating_p)
8741 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
8742 else
8744 /* We checked above that the vectors are constant-length. */
8745 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
8746 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
8747 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
8748 vperm.quick_push ({{p.first, vi}, vl});
8751 /* Advance to the next group. */
8752 for (unsigned j = 0; j < children.length (); ++j)
8753 active_lane[j] += SLP_TREE_LANES (children[j]);
8756 if (dump_p)
8758 dump_printf_loc (MSG_NOTE, vect_location,
8759 "vectorizing permutation");
8760 for (unsigned i = 0; i < perm.length (); ++i)
8761 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8762 if (repeating_p)
8763 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8764 dump_printf (MSG_NOTE, "\n");
8765 dump_printf_loc (MSG_NOTE, vect_location, "as");
8766 for (unsigned i = 0; i < vperm.length (); ++i)
8768 if (i != 0
8769 && (repeating_p
8770 ? multiple_p (i, npatterns)
8771 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
8772 dump_printf (MSG_NOTE, ",");
8773 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
8774 vperm[i].first.first, vperm[i].first.second,
8775 vperm[i].second);
8777 dump_printf (MSG_NOTE, "\n");
8780 /* We can only handle two-vector permutes; everything else should
8781 be lowered on the SLP level. The following is closely inspired
8782 by vect_transform_slp_perm_load and is supposed to eventually
8783 replace it.
8784 ??? As intermediate step do code-gen in the SLP tree representation
8785 somehow? */
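/* Continuing the illustrative blend above with V4SI vectors: the first
   entry makes first_vec == {0,0}, the second makes second_vec == {1,0}
   and biases its lane by nunits, so the encoded mask becomes
   { 0, 5, 2, 7, 4, 9 }, which for 4-element vectors is the two-input
   permute { 0, 5, 2, 7 }.  */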
8786 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
8787 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
8788 unsigned int index = 0;
8789 poly_uint64 mask_element;
8790 vec_perm_builder mask;
8791 mask.new_vector (nunits, npatterns, nelts_per_pattern);
8792 unsigned int count = mask.encoded_nelts ();
8793 mask.quick_grow (count);
8794 vec_perm_indices indices;
8795 unsigned nperms = 0;
8796 for (unsigned i = 0; i < vperm.length (); ++i)
8798 mask_element = vperm[i].second;
8799 if (first_vec.first == -1U
8800 || first_vec == vperm[i].first)
8801 first_vec = vperm[i].first;
8802 else if (second_vec.first == -1U
8803 || second_vec == vperm[i].first)
8805 second_vec = vperm[i].first;
8806 mask_element += nunits;
8808 else
8810 if (dump_p)
8811 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8812 "permutation requires at "
8813 "least three vectors\n");
8814 gcc_assert (!gsi);
8815 return -1;
8818 mask[index++] = mask_element;
8820 if (index == count)
8822 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
8823 TYPE_VECTOR_SUBPARTS (op_vectype));
8824 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
8825 && constant_multiple_p (mask[0], nunits));
8826 machine_mode vmode = TYPE_MODE (vectype);
8827 machine_mode op_vmode = TYPE_MODE (op_vectype);
8828 unsigned HOST_WIDE_INT c;
8829 if ((!identity_p
8830 && !can_vec_perm_const_p (vmode, op_vmode, indices))
8831 || (identity_p
8832 && !known_le (nunits,
8833 TYPE_VECTOR_SUBPARTS (op_vectype))
8834 && (!constant_multiple_p (nunits,
8835 TYPE_VECTOR_SUBPARTS (op_vectype),
8836 &c) || c != 2)))
8838 if (dump_p)
8840 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8841 vect_location,
8842 "unsupported vect permute { ");
8843 for (i = 0; i < count; ++i)
8845 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8846 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8848 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8850 gcc_assert (!gsi);
8851 return -1;
8854 if (!identity_p)
8855 nperms++;
8856 if (gsi)
8858 if (second_vec.first == -1U)
8859 second_vec = first_vec;
8861 slp_tree
8862 first_node = children[first_vec.first],
8863 second_node = children[second_vec.first];
8865 tree mask_vec = NULL_TREE;
8866 if (!identity_p)
8867 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8869 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
8871 tree first_def
8872 = vect_get_slp_vect_def (first_node,
8873 first_vec.second + vi);
8874 tree second_def
8875 = vect_get_slp_vect_def (second_node,
8876 second_vec.second + vi);
8877 vect_add_slp_permutation (vinfo, gsi, node, first_def,
8878 second_def, mask_vec, mask[0]);
8882 index = 0;
8883 first_vec = std::make_pair (-1U, -1U);
8884 second_vec = std::make_pair (-1U, -1U);
8888 return nperms;
8891 /* Vectorize the SLP permutations in NODE as specified
8892 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
8893 child number and lane number.
8894 Interleaving of two two-lane two-child SLP subtrees (not supported):
8895 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
8896 A blend of two four-lane two-child SLP subtrees:
8897 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
8898 Highpart of a four-lane one-child SLP subtree (not supported):
8899 [ { 0, 2 }, { 0, 3 } ]
8900 Only a subset of these is currently supported by the code generation below. */
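/* A note inferred from the code above: identity permutes are not
   counted, so the returned NPERMS covers only the VEC_PERM_EXPRs that
   will actually be emitted, and that is what gets costed as vec_perm
   below when GSI is NULL.  */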
8902 static bool
8903 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8904 slp_tree node, stmt_vector_for_cost *cost_vec)
8906 tree vectype = SLP_TREE_VECTYPE (node);
8907 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
8908 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
8909 SLP_TREE_CHILDREN (node),
8910 dump_enabled_p ());
8911 if (nperms < 0)
8912 return false;
8914 if (!gsi)
8915 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
8917 return true;
8920 /* Vectorize SLP NODE. */
8922 static void
8923 vect_schedule_slp_node (vec_info *vinfo,
8924 slp_tree node, slp_instance instance)
8926 gimple_stmt_iterator si;
8927 int i;
8928 slp_tree child;
8930 /* For existing vectors there's nothing to do. */
8931 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
8932 && SLP_TREE_VEC_DEFS (node).exists ())
8933 return;
8935 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
8937 /* Vectorize externals and constants. */
8938 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
8939 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
8941 /* ??? vectorizable_shift can end up using a scalar operand which is
8942 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
8943 node in this case. */
8944 if (!SLP_TREE_VECTYPE (node))
8945 return;
8947 vect_create_constant_vectors (vinfo, node);
8948 return;
8951 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
8953 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
8954 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
8956 if (dump_enabled_p ())
8957 dump_printf_loc (MSG_NOTE, vect_location,
8958 "------>vectorizing SLP node starting from: %G",
8959 stmt_info->stmt);
8961 if (STMT_VINFO_DATA_REF (stmt_info)
8962 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8964 /* Vectorized loads go before the first scalar load to make the
8965 result available early; vectorized stores go before the last scalar
8966 stmt of the group, which is where all uses are ready. */
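/* A concrete illustration (hypothetical group): the vector store for
   a group of scalar stores a[0] ... a[3] is inserted right before the
   a[3] store, whereas the vector load for b[0] ... b[3] is inserted
   right before the b[0] load.  */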
8967 stmt_vec_info last_stmt_info = NULL;
8968 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
8969 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
8970 else /* DR_IS_WRITE */
8971 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
8972 si = gsi_for_stmt (last_stmt_info->stmt);
8974 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
8975 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
8976 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
8977 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8979 /* For PHI node vectorization we do not use the insertion iterator. */
8980 si = gsi_none ();
8982 else
8984 /* Emit other stmts after the children's vectorized defs, which is
8985 the earliest possible place. */
8986 gimple *last_stmt = NULL;
8987 bool seen_vector_def = false;
8988 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8989 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8991 /* For fold-left reductions we are retaining the scalar
8992 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
8993 set so the representation isn't perfect. Resort to the
8994 last scalar def here. */
8995 if (SLP_TREE_VEC_DEFS (child).is_empty ())
8997 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
8998 == cycle_phi_info_type);
8999 gphi *phi = as_a <gphi *>
9000 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
9001 if (!last_stmt
9002 || vect_stmt_dominates_stmt_p (last_stmt, phi))
9003 last_stmt = phi;
9005 /* We emit all vectorized stmts at the same place, so the last
9006 def in SLP_TREE_VEC_DEFS is also the last one emitted.
9007 ??? Unless we have a load permutation applied and that happens
9008 to re-use an earlier generated load. */
9009 unsigned j;
9010 tree vdef;
9011 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9013 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9014 if (!last_stmt
9015 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9016 last_stmt = vstmt;
9019 else if (!SLP_TREE_VECTYPE (child))
9021 /* For externals used unvectorized we look at all their scalar defs. */
9022 unsigned j;
9023 tree def;
9024 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
9025 if (TREE_CODE (def) == SSA_NAME
9026 && !SSA_NAME_IS_DEFAULT_DEF (def))
9028 gimple *stmt = SSA_NAME_DEF_STMT (def);
9029 if (!last_stmt
9030 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
9031 last_stmt = stmt;
9034 else
9036 /* For externals we have to look at all defs since their
9037 insertion place is decided per vector. But beware
9038 of pre-existing vectors where we need to make sure
9039 we do not insert before the region boundary. */
9040 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
9041 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
9042 seen_vector_def = true;
9043 else
9045 unsigned j;
9046 tree vdef;
9047 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9048 if (TREE_CODE (vdef) == SSA_NAME
9049 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
9051 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9052 if (!last_stmt
9053 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9054 last_stmt = vstmt;
9058 /* This can happen when all children are pre-existing vectors or
9059 constants. */
9060 if (!last_stmt)
9061 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
9062 if (!last_stmt)
9064 gcc_assert (seen_vector_def);
9065 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9067 else if (is_ctrl_altering_stmt (last_stmt))
9069 /* We split regions to vectorize at control altering stmts
9070 with a definition so this must be an external which
9071 we can insert at the start of the region. */
9072 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9074 else if (is_a <bb_vec_info> (vinfo)
9075 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
9076 && gimple_could_trap_p (stmt_info->stmt))
9078 /* We've constrained possibly trapping operations to all come
9079 from the same basic-block; even if vectorized defs would allow
9080 earlier scheduling, still force the vectorized stmts into the
9081 original block. This is only necessary for BB vectorization since
9082 for loop vect all operations are in a single BB and scalar stmt
9083 based placement doesn't play well with epilogue vectorization. */
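/* For example (an assumed scenario): a division, which
   gimple_could_trap_p flags, is still emitted at the start of the
   basic block of its scalar stmt via gsi_after_labels even when its
   vectorized operands are defined in a dominating block.  */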
9084 gcc_assert (dominated_by_p (CDI_DOMINATORS,
9085 gimple_bb (stmt_info->stmt),
9086 gimple_bb (last_stmt)));
9087 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
9089 else if (is_a <gphi *> (last_stmt))
9090 si = gsi_after_labels (gimple_bb (last_stmt));
9091 else
9093 si = gsi_for_stmt (last_stmt);
9094 gsi_next (&si);
9098 /* Handle purely internal nodes. */
9099 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9101 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
9102 be shared among different SLP nodes (but usually it's the same
9103 operation, apart from the case where the stmt is only there to denote
9104 the actual scalar lane defs ...). So do not call vect_transform_stmt
9105 but open-code it here (partly). */
9106 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
9107 gcc_assert (done);
9108 stmt_vec_info slp_stmt_info;
9109 unsigned int i;
9110 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9111 if (STMT_VINFO_LIVE_P (slp_stmt_info))
9113 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9114 instance, i, true, NULL);
9115 gcc_assert (done);
9118 else
9119 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9122 /* Replace scalar calls from SLP node NODE by setting their lhs to zero.
9123 For loop vectorization this is done in vectorizable_call, but for SLP
9124 it needs to be deferred until end of vect_schedule_slp, because multiple
9125 SLP instances may refer to the same scalar stmt. */
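/* For instance (the names are made up): a fully vectorized scalar call
   x_1 = sqrtf (a_2) is rewritten to x_1 = 0.0f, while a call without a
   lhs is replaced by a GIMPLE nop with its virtual defs unlinked, as
   done below.  */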
9127 static void
9128 vect_remove_slp_scalar_calls (vec_info *vinfo,
9129 slp_tree node, hash_set<slp_tree> &visited)
9131 gimple *new_stmt;
9132 gimple_stmt_iterator gsi;
9133 int i;
9134 slp_tree child;
9135 tree lhs;
9136 stmt_vec_info stmt_info;
9138 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9139 return;
9141 if (visited.add (node))
9142 return;
9144 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9145 vect_remove_slp_scalar_calls (vinfo, child, visited);
9147 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9149 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
9150 if (!stmt || gimple_bb (stmt) == NULL)
9151 continue;
9152 if (is_pattern_stmt_p (stmt_info)
9153 || !PURE_SLP_STMT (stmt_info))
9154 continue;
9155 lhs = gimple_call_lhs (stmt);
9156 if (lhs)
9157 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9158 else
9160 new_stmt = gimple_build_nop ();
9161 unlink_stmt_vdef (stmt_info->stmt);
9163 gsi = gsi_for_stmt (stmt);
9164 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9165 if (lhs)
9166 SSA_NAME_DEF_STMT (lhs) = new_stmt;
9170 static void
9171 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9173 hash_set<slp_tree> visited;
9174 vect_remove_slp_scalar_calls (vinfo, node, visited);
9177 /* Vectorize the instance root. */
9179 void
9180 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9182 gassign *rstmt = NULL;
9184 if (instance->kind == slp_inst_kind_ctor)
9186 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9188 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9189 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9190 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9191 TREE_TYPE (vect_lhs)))
9192 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9193 vect_lhs);
9194 rstmt = gimple_build_assign (root_lhs, vect_lhs);
9196 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9198 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9199 tree child_def;
9200 int j;
9201 vec<constructor_elt, va_gc> *v;
9202 vec_alloc (v, nelts);
9204 /* A CTOR can handle V16HI composition from VNx8HI so we
9205 do not need to convert vector elements if the types
9206 do not match. */
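/* E.g. (illustrative only): with two V4SI vector defs v1 and v2 the
   root statement is rewritten as lhs = { v1, v2 }, a CONSTRUCTOR of
   the root rhs type built from the per-vector defs.  */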
9207 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9208 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9209 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9210 tree rtype
9211 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9212 tree r_constructor = build_constructor (rtype, v);
9213 rstmt = gimple_build_assign (lhs, r_constructor);
9216 else if (instance->kind == slp_inst_kind_bb_reduc)
9218 /* Largely inspired by reduction chain epilogue handling in
9219 vect_create_epilog_for_reduction. */
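/* A sketch of the generated epilogue under assumed inputs (two V4SI
   defs, PLUS_EXPR, undefined signed overflow): both defs are
   VIEW_CONVERTed to the unsigned vector type, summed into a single
   vector, reduced with .REDUC_PLUS, any SLP_INSTANCE_REMAIN_DEFS are
   added on, and the scalar result is converted back to the signed
   element type before replacing the root statement's rhs.  */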
9220 vec<tree> vec_defs = vNULL;
9221 vect_get_slp_defs (node, &vec_defs);
9222 enum tree_code reduc_code
9223 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9224 /* ??? We actually have to reflect signs somewhere. */
9225 if (reduc_code == MINUS_EXPR)
9226 reduc_code = PLUS_EXPR;
9227 gimple_seq epilogue = NULL;
9228 /* We may end up with more than one vector result; reduce them
9229 to a single vector. */
9230 tree vec_def = vec_defs[0];
9231 tree vectype = TREE_TYPE (vec_def);
9232 tree compute_vectype = vectype;
9233 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9234 && TYPE_OVERFLOW_UNDEFINED (vectype)
9235 && operation_can_overflow (reduc_code));
9236 if (pun_for_overflow_p)
9238 compute_vectype = unsigned_type_for (vectype);
9239 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9240 compute_vectype, vec_def);
9242 for (unsigned i = 1; i < vec_defs.length (); ++i)
9244 tree def = vec_defs[i];
9245 if (pun_for_overflow_p)
9246 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9247 compute_vectype, def);
9248 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
9249 vec_def, def);
9251 vec_defs.release ();
9252 /* ??? Support schemes other than a direct internal fn. */
9253 internal_fn reduc_fn;
9254 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9255 || reduc_fn == IFN_LAST)
9256 gcc_unreachable ();
9257 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9258 TREE_TYPE (compute_vectype), vec_def);
9259 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9261 tree rem_def = NULL_TREE;
9262 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9264 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
9265 if (!rem_def)
9266 rem_def = def;
9267 else
9268 rem_def = gimple_build (&epilogue, reduc_code,
9269 TREE_TYPE (scalar_def),
9270 rem_def, def);
9272 scalar_def = gimple_build (&epilogue, reduc_code,
9273 TREE_TYPE (scalar_def),
9274 scalar_def, rem_def);
9276 scalar_def = gimple_convert (&epilogue,
9277 TREE_TYPE (vectype), scalar_def);
9278 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9279 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9280 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9281 update_stmt (gsi_stmt (rgsi));
9282 return;
9284 else
9285 gcc_unreachable ();
9287 gcc_assert (rstmt);
9289 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9290 gsi_replace (&rgsi, rstmt, true);
9293 struct slp_scc_info
9295 bool on_stack;
9296 int dfs;
9297 int lowlink;
9300 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
9302 static void
9303 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9304 hash_map<slp_tree, slp_scc_info> &scc_info,
9305 int &maxdfs, vec<slp_tree> &stack)
9307 bool existed_p;
9308 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9309 gcc_assert (!existed_p);
9310 info->dfs = maxdfs;
9311 info->lowlink = maxdfs;
9312 maxdfs++;
9314 /* Leaf. */
9315 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9317 info->on_stack = false;
9318 vect_schedule_slp_node (vinfo, node, instance);
9319 return;
9322 info->on_stack = true;
9323 stack.safe_push (node);
9325 unsigned i;
9326 slp_tree child;
9327 /* DFS recurse. */
9328 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9330 if (!child)
9331 continue;
9332 slp_scc_info *child_info = scc_info.get (child);
9333 if (!child_info)
9335 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9336 /* Recursion might have re-allocated the node. */
9337 info = scc_info.get (node);
9338 child_info = scc_info.get (child);
9339 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9341 else if (child_info->on_stack)
9342 info->lowlink = MIN (info->lowlink, child_info->dfs);
9344 if (info->lowlink != info->dfs)
9345 return;
9347 auto_vec<slp_tree, 4> phis_to_fixup;
9349 /* Singleton. */
9350 if (stack.last () == node)
9352 stack.pop ();
9353 info->on_stack = false;
9354 vect_schedule_slp_node (vinfo, node, instance);
9355 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9356 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9357 phis_to_fixup.quick_push (node);
9359 else
9361 /* SCC. */
9362 int last_idx = stack.length () - 1;
9363 while (stack[last_idx] != node)
9364 last_idx--;
9365 /* We can break the cycle at PHIs which have at least one child
9366 code generated. Then we could re-start the DFS walk until
9367 all nodes in the SCC are covered (we might have new entries
9368 for only back-reachable nodes). But it's simpler to just
9369 iterate and schedule those that are ready. */
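/* E.g. (an assumed reduction cycle of a PHI and an addition): the PHI
   is ready because its preheader child is already code generated, so
   it is scheduled first and taken off the stack; the addition then
   becomes ready on the next iteration, and the still-missing backedge
   PHI arguments are filled in by the fixup loop below.  */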
9370 unsigned todo = stack.length () - last_idx;
9373 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9375 slp_tree entry = stack[idx];
9376 if (!entry)
9377 continue;
9378 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9379 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9380 bool ready = !phi;
9381 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9382 if (!child)
9384 gcc_assert (phi);
9385 ready = true;
9386 break;
9388 else if (scc_info.get (child)->on_stack)
9390 if (!phi)
9392 ready = false;
9393 break;
9396 else
9398 if (phi)
9400 ready = true;
9401 break;
9404 if (ready)
9406 vect_schedule_slp_node (vinfo, entry, instance);
9407 scc_info.get (entry)->on_stack = false;
9408 stack[idx] = NULL;
9409 todo--;
9410 if (phi)
9411 phis_to_fixup.safe_push (entry);
9415 while (todo != 0);
9417 /* Pop the SCC. */
9418 stack.truncate (last_idx);
9421 /* Now fix up the backedge defs of the vectorized PHIs in this SCC. */
9422 slp_tree phi_node;
9423 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9425 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9426 edge_iterator ei;
9427 edge e;
9428 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9430 unsigned dest_idx = e->dest_idx;
9431 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9432 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9433 continue;
9434 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9435 /* Simply fill all args. */
9436 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9437 != vect_first_order_recurrence)
9438 for (unsigned i = 0; i < n; ++i)
9440 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9441 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9442 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
9443 e, gimple_phi_arg_location (phi, dest_idx));
9445 else
9447 /* Unless it is a first order recurrence which needs
9448 args filled in for both the PHI node and the permutes. */
9449 gimple *perm
9450 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9451 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9452 add_phi_arg (as_a <gphi *> (rphi),
9453 vect_get_slp_vect_def (child, n - 1),
9454 e, gimple_phi_arg_location (phi, dest_idx));
9455 for (unsigned i = 0; i < n; ++i)
9457 gimple *perm
9458 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9459 if (i > 0)
9460 gimple_assign_set_rhs1 (perm,
9461 vect_get_slp_vect_def (child, i - 1));
9462 gimple_assign_set_rhs2 (perm,
9463 vect_get_slp_vect_def (child, i));
9464 update_stmt (perm);
9471 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9473 void
9474 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9476 slp_instance instance;
9477 unsigned int i;
9479 hash_map<slp_tree, slp_scc_info> scc_info;
9480 int maxdfs = 0;
9481 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9483 slp_tree node = SLP_INSTANCE_TREE (instance);
9484 if (dump_enabled_p ())
9486 dump_printf_loc (MSG_NOTE, vect_location,
9487 "Vectorizing SLP tree:\n");
9488 /* ??? Dump all? */
9489 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9490 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9491 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9492 vect_print_slp_graph (MSG_NOTE, vect_location,
9493 SLP_INSTANCE_TREE (instance));
9495 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
9496 have a PHI be the node breaking the cycle. */
9497 auto_vec<slp_tree> stack;
9498 if (!scc_info.get (node))
9499 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9501 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9502 vectorize_slp_instance_root_stmt (node, instance);
9504 if (dump_enabled_p ())
9505 dump_printf_loc (MSG_NOTE, vect_location,
9506 "vectorizing stmts using SLP.\n");
9509 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9511 slp_tree root = SLP_INSTANCE_TREE (instance);
9512 stmt_vec_info store_info;
9513 unsigned int j;
9515 /* Remove scalar call stmts. Do not do this for basic-block
9516 vectorization as not all uses may be vectorized.
9517 ??? Why should this be necessary? DCE should be able to
9518 remove the stmts itself.
9519 ??? For BB vectorization we can as well remove scalar
9520 stmts starting from the SLP tree root if they have no
9521 uses. */
9522 if (is_a <loop_vec_info> (vinfo))
9523 vect_remove_slp_scalar_calls (vinfo, root);
9525 /* Remove the original scalar stmts of vectorized stores. */
9526 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9528 if (!STMT_VINFO_DATA_REF (store_info)
9529 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9530 break;
9532 store_info = vect_orig_stmt (store_info);
9533 /* Free the attached stmt_vec_info and remove the stmt. */
9534 vinfo->remove_stmt (store_info);
9536 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
9537 so we do not crash in vect_free_slp_tree later. */
9538 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9539 SLP_TREE_REPRESENTATIVE (root) = NULL;