gcc/tree-vect-slp.cc
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "insn-config.h"
35 #include "recog.h" /* FIXME: for insn_data */
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 #include "dump-context.h"
49 #include "cfganal.h"
50 #include "tree-eh.h"
51 #include "tree-cfg.h"
52 #include "alloc-pool.h"
53 #include "sreal.h"
54 #include "predict.h"
56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
71 static object_allocator<_slp_tree> *slp_tree_pool;
72 static slp_tree slp_first_node;
74 void
75 vect_slp_init (void)
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
80 void
81 vect_slp_fini (void)
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
89 void *
90 _slp_tree::operator new (size_t n)
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
96 void
97 _slp_tree::operator delete (void *node, size_t n)
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (node);
104 /* Initialize a SLP node. */
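/* Newly created nodes are chained into a global doubly-linked list headed
   by slp_first_node so that vect_slp_fini can reclaim any nodes still
   live when the pass finishes.  */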
106 _slp_tree::_slp_tree ()
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_DEFS (this) = vNULL;
116 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
117 SLP_TREE_CHILDREN (this) = vNULL;
118 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
119 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
120 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
121 SLP_TREE_CODE (this) = ERROR_MARK;
122 SLP_TREE_VECTYPE (this) = NULL_TREE;
123 SLP_TREE_REPRESENTATIVE (this) = NULL;
124 SLP_TREE_REF_COUNT (this) = 1;
125 this->failed = NULL;
126 this->max_nunits = 1;
127 this->lanes = 0;
130 /* Tear down a SLP node. */
132 _slp_tree::~_slp_tree ()
134 if (this->prev_node)
135 this->prev_node->next_node = this->next_node;
136 else
137 slp_first_node = this->next_node;
138 if (this->next_node)
139 this->next_node->prev_node = this->prev_node;
140 SLP_TREE_CHILDREN (this).release ();
141 SLP_TREE_SCALAR_STMTS (this).release ();
142 SLP_TREE_SCALAR_OPS (this).release ();
143 SLP_TREE_VEC_DEFS (this).release ();
144 SLP_TREE_LOAD_PERMUTATION (this).release ();
145 SLP_TREE_LANE_PERMUTATION (this).release ();
146 if (this->failed)
147 free (failed);
150 /* Push the single SSA definition in DEF to the vector of vector defs. */
152 void
153 _slp_tree::push_vec_def (gimple *def)
155 if (gphi *phi = dyn_cast <gphi *> (def))
156 vec_defs.quick_push (gimple_phi_result (phi));
157 else
159 def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
160 vec_defs.quick_push (get_def_from_ptr (defop));
164 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
166 void
167 vect_free_slp_tree (slp_tree node)
169 int i;
170 slp_tree child;
172 if (--SLP_TREE_REF_COUNT (node) != 0)
173 return;
175 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
176 if (child)
177 vect_free_slp_tree (child);
179 /* If the node defines any SLP only patterns then those patterns are no
180 longer valid and should be removed. */
181 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
182 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
184 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
185 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
186 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
189 delete node;
 192 /* Return a location suitable for dumps related to the SLP instance. */
194 dump_user_location_t
195 _slp_instance::location () const
197 if (!root_stmts.is_empty ())
198 return root_stmts[0]->stmt;
199 else
200 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
204 /* Free the memory allocated for the SLP instance. */
206 void
207 vect_free_slp_instance (slp_instance instance)
209 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
210 SLP_INSTANCE_LOADS (instance).release ();
211 SLP_INSTANCE_ROOT_STMTS (instance).release ();
212 SLP_INSTANCE_REMAIN_DEFS (instance).release ();
213 instance->subgraph_entries.release ();
214 instance->cost_vec.release ();
215 free (instance);
 219 /* Create an SLP node for a CODE operation with room for NOPS children. */
221 slp_tree
222 vect_create_new_slp_node (unsigned nops, tree_code code)
224 slp_tree node = new _slp_tree;
225 SLP_TREE_SCALAR_STMTS (node) = vNULL;
226 SLP_TREE_CHILDREN (node).create (nops);
227 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
228 SLP_TREE_CODE (node) = code;
229 return node;
231 /* Create an SLP node for SCALAR_STMTS. */
233 static slp_tree
234 vect_create_new_slp_node (slp_tree node,
235 vec<stmt_vec_info> scalar_stmts, unsigned nops)
237 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
238 SLP_TREE_CHILDREN (node).create (nops);
239 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
240 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
241 SLP_TREE_LANES (node) = scalar_stmts.length ();
242 return node;
245 /* Create an SLP node for SCALAR_STMTS. */
247 static slp_tree
248 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
250 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
253 /* Create an SLP node for OPS. */
255 static slp_tree
256 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
258 SLP_TREE_SCALAR_OPS (node) = ops;
259 SLP_TREE_DEF_TYPE (node) = vect_external_def;
260 SLP_TREE_LANES (node) = ops.length ();
261 return node;
264 /* Create an SLP node for OPS. */
266 static slp_tree
267 vect_create_new_slp_node (vec<tree> ops)
269 return vect_create_new_slp_node (new _slp_tree, ops);
273 /* This structure is used in creation of an SLP tree. Each instance
274 corresponds to the same operand in a group of scalar stmts in an SLP
275 node. */
276 typedef struct _slp_oprnd_info
278 /* Def-stmts for the operands. */
279 vec<stmt_vec_info> def_stmts;
280 /* Operands. */
281 vec<tree> ops;
282 /* Information about the first statement, its vector def-type, type, the
283 operand itself in case it's constant, and an indication if it's a pattern
284 stmt. */
285 tree first_op_type;
286 enum vect_def_type first_dt;
287 bool any_pattern;
288 } *slp_oprnd_info;
291 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
292 operand. */
293 static vec<slp_oprnd_info>
294 vect_create_oprnd_info (int nops, int group_size)
296 int i;
297 slp_oprnd_info oprnd_info;
298 vec<slp_oprnd_info> oprnds_info;
300 oprnds_info.create (nops);
301 for (i = 0; i < nops; i++)
303 oprnd_info = XNEW (struct _slp_oprnd_info);
304 oprnd_info->def_stmts.create (group_size);
305 oprnd_info->ops.create (group_size);
306 oprnd_info->first_dt = vect_uninitialized_def;
307 oprnd_info->first_op_type = NULL_TREE;
308 oprnd_info->any_pattern = false;
309 oprnds_info.quick_push (oprnd_info);
312 return oprnds_info;
316 /* Free operands info. */
318 static void
319 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
321 int i;
322 slp_oprnd_info oprnd_info;
324 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
326 oprnd_info->def_stmts.release ();
327 oprnd_info->ops.release ();
328 XDELETE (oprnd_info);
331 oprnds_info.release ();
334 /* Return the execution frequency of NODE (so that a higher value indicates
335 a "more important" node when optimizing for speed). */
337 static sreal
338 vect_slp_node_weight (slp_tree node)
340 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
341 basic_block bb = gimple_bb (stmt_info->stmt);
342 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
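/* That is, the weight is the block count scaled relative to the function
   entry count, so e.g. a stmt executing roughly ten times per function
   invocation gets a weight of about 10.  */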
345 /* Return true if STMTS contains a pattern statement. */
347 static bool
348 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
350 stmt_vec_info stmt_info;
351 unsigned int i;
352 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
353 if (is_pattern_stmt_p (stmt_info))
354 return true;
355 return false;
358 /* Return true when all lanes in the external or constant NODE have
359 the same value. */
361 static bool
362 vect_slp_tree_uniform_p (slp_tree node)
364 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
365 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
 367 /* Pre-existing vectors. */
368 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
369 return false;
371 unsigned i;
372 tree op, first = NULL_TREE;
373 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
374 if (!first)
375 first = op;
376 else if (!operand_equal_p (first, op, 0))
377 return false;
379 return true;
382 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
383 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
384 of the chain. */
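/* The place is computed as the sum of the DR_GROUP_GAPs of the chain
   members after FIRST_STMT_INFO up to and including STMT_INFO; for a
   dense chain without gaps this is simply the zero-based position of
   STMT_INFO in the group.  */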
387 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
388 stmt_vec_info first_stmt_info)
390 stmt_vec_info next_stmt_info = first_stmt_info;
391 int result = 0;
393 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
394 return -1;
398 if (next_stmt_info == stmt_info)
399 return result;
400 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
401 if (next_stmt_info)
402 result += DR_GROUP_GAP (next_stmt_info);
404 while (next_stmt_info);
406 return -1;
409 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
410 using the method implemented by duplicate_and_interleave. Return true
411 if so, returning the number of intermediate vectors in *NVECTORS_OUT
412 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
413 (if nonnull). */
415 bool
416 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
417 tree elt_type, unsigned int *nvectors_out,
418 tree *vector_type_out,
419 tree *permutes)
421 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
422 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
423 return false;
425 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
426 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
427 unsigned int nvectors = 1;
428 for (;;)
430 scalar_int_mode int_mode;
431 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
432 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
434 /* Get the natural vector type for this SLP group size. */
435 tree int_type = build_nonstandard_integer_type
436 (GET_MODE_BITSIZE (int_mode), 1);
437 tree vector_type
438 = get_vectype_for_scalar_type (vinfo, int_type, count);
439 poly_int64 half_nelts;
440 if (vector_type
441 && VECTOR_MODE_P (TYPE_MODE (vector_type))
442 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
443 GET_MODE_SIZE (base_vector_mode))
444 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
445 2, &half_nelts))
447 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
448 together into elements of type INT_TYPE and using the result
449 to build NVECTORS vectors. */
450 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
451 vec_perm_builder sel1 (nelts, 2, 3);
452 vec_perm_builder sel2 (nelts, 2, 3);
454 for (unsigned int i = 0; i < 3; ++i)
456 sel1.quick_push (i);
457 sel1.quick_push (i + nelts);
458 sel2.quick_push (half_nelts + i);
459 sel2.quick_push (half_nelts + i + nelts);
461 vec_perm_indices indices1 (sel1, 2, nelts);
462 vec_perm_indices indices2 (sel2, 2, nelts);
463 machine_mode vmode = TYPE_MODE (vector_type);
464 if (can_vec_perm_const_p (vmode, vmode, indices1)
465 && can_vec_perm_const_p (vmode, vmode, indices2))
467 if (nvectors_out)
468 *nvectors_out = nvectors;
469 if (vector_type_out)
470 *vector_type_out = vector_type;
471 if (permutes)
473 permutes[0] = vect_gen_perm_mask_checked (vector_type,
474 indices1);
475 permutes[1] = vect_gen_perm_mask_checked (vector_type,
476 indices2);
478 return true;
482 if (!multiple_p (elt_bytes, 2, &elt_bytes))
483 return false;
484 nvectors *= 2;
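/* As a sketch: for COUNT == 4 32-bit elements the first iteration looks
   for a 128-bit integer mode fusing all four elements into one value;
   whenever no suitable vector type or permutation exists, ELT_BYTES is
   halved and NVECTORS doubled, so the next attempt builds two vectors
   from 64-bit chunks, and so on.  INDICES1 and INDICES2 interleave the
   low respectively high halves of two such vectors.  */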
488 /* Return true if DTA and DTB match. */
490 static bool
491 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
493 return (dta == dtb
494 || ((dta == vect_external_def || dta == vect_constant_def)
495 && (dtb == vect_external_def || dtb == vect_constant_def)));
498 static const int cond_expr_maps[3][5] = {
499 { 4, -1, -2, 1, 2 },
500 { 4, -2, -1, 1, 2 },
501 { 4, -1, -2, 2, 1 }
503 static const int arg1_map[] = { 1, 1 };
504 static const int arg2_map[] = { 1, 2 };
505 static const int arg1_arg4_map[] = { 2, 1, 4 };
506 static const int arg3_arg2_map[] = { 2, 3, 2 };
507 static const int op1_op0_map[] = { 2, 1, 0 };
509 /* For most SLP statements, there is a one-to-one mapping between
510 gimple arguments and child nodes. If that is not true for STMT,
511 return an array that contains:
513 - the number of child nodes, followed by
514 - for each child node, the index of the argument associated with that node.
515 The special index -1 is the first operand of an embedded comparison and
516 the special index -2 is the second operand of an embedded comparison.
518 SWAP is as for vect_get_and_check_slp_defs. */
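/* For example arg1_arg4_map above says that an IFN_MASK_GATHER_LOAD call
   has two SLP children which correspond to call arguments 1 and 4, while
   cond_expr_maps[1] describes a COND_EXPR with four children whose first
   two are the (swapped) second and first operands of the embedded
   comparison and whose last two are operands 1 and 2 of the COND_EXPR.  */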
520 static const int *
521 vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
523 if (auto assign = dyn_cast<const gassign *> (stmt))
525 if (gimple_assign_rhs_code (assign) == COND_EXPR
526 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
527 return cond_expr_maps[swap];
528 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
529 && swap)
530 return op1_op0_map;
532 gcc_assert (!swap);
533 if (auto call = dyn_cast<const gcall *> (stmt))
535 if (gimple_call_internal_p (call))
536 switch (gimple_call_internal_fn (call))
538 case IFN_MASK_LOAD:
539 return arg2_map;
541 case IFN_GATHER_LOAD:
542 return arg1_map;
544 case IFN_MASK_GATHER_LOAD:
545 return arg1_arg4_map;
547 case IFN_MASK_STORE:
548 return arg3_arg2_map;
550 default:
551 break;
554 return nullptr;
557 /* Return the SLP node child index for operand OP of STMT. */
560 vect_slp_child_index_for_operand (const gimple *stmt, int op)
562 const int *opmap = vect_get_operand_map (stmt);
563 if (!opmap)
564 return op;
565 for (int i = 1; i < 1 + opmap[0]; ++i)
566 if (opmap[i] == op)
567 return i - 1;
568 gcc_unreachable ();
571 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
572 they are of a valid type and that they match the defs of the first stmt of
573 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
574 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
575 indicates swap is required for cond_expr stmts. Specifically, SWAP
576 is 1 if STMT is cond and operands of comparison need to be swapped;
577 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
579 If there was a fatal error return -1; if the error could be corrected by
 580 swapping the operands of this node's father, return 1; if everything is
581 ok return 0. */
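/* As an illustration, for the group { x0 = a0 + b0; x1 = b1 + a1 } where
   the a's and b's have different def types, the operands of the second
   stmt are swapped so that defs of the same type end up in the same
   operand slot.  */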
582 static int
583 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
584 bool *skip_args,
585 vec<stmt_vec_info> stmts, unsigned stmt_num,
586 vec<slp_oprnd_info> *oprnds_info)
588 stmt_vec_info stmt_info = stmts[stmt_num];
589 tree oprnd;
590 unsigned int i, number_of_oprnds;
591 enum vect_def_type dt = vect_uninitialized_def;
592 slp_oprnd_info oprnd_info;
593 unsigned int commutative_op = -1U;
594 bool first = stmt_num == 0;
596 if (!is_a<gcall *> (stmt_info->stmt)
597 && !is_a<gassign *> (stmt_info->stmt)
598 && !is_a<gphi *> (stmt_info->stmt))
599 return -1;
601 number_of_oprnds = gimple_num_args (stmt_info->stmt);
602 const int *map = vect_get_operand_map (stmt_info->stmt, swap);
603 if (map)
604 number_of_oprnds = *map++;
605 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
607 if (gimple_call_internal_p (stmt))
609 internal_fn ifn = gimple_call_internal_fn (stmt);
610 commutative_op = first_commutative_argument (ifn);
613 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
615 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
616 commutative_op = 0;
619 bool swapped = (swap != 0);
620 bool backedge = false;
621 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
622 for (i = 0; i < number_of_oprnds; i++)
624 int opno = map ? map[i] : int (i);
625 if (opno < 0)
626 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
627 else
629 oprnd = gimple_arg (stmt_info->stmt, opno);
630 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
631 backedge = dominated_by_p (CDI_DOMINATORS,
632 gimple_phi_arg_edge (stmt, opno)->src,
633 gimple_bb (stmt_info->stmt));
635 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
636 oprnd = TREE_OPERAND (oprnd, 0);
638 oprnd_info = (*oprnds_info)[i];
640 stmt_vec_info def_stmt_info;
641 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
643 if (dump_enabled_p ())
644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
645 "Build SLP failed: can't analyze def for %T\n",
646 oprnd);
648 return -1;
651 if (skip_args[i])
653 oprnd_info->def_stmts.quick_push (NULL);
654 oprnd_info->ops.quick_push (NULL_TREE);
655 oprnd_info->first_dt = vect_uninitialized_def;
656 continue;
659 oprnd_info->def_stmts.quick_push (def_stmt_info);
660 oprnd_info->ops.quick_push (oprnd);
662 if (def_stmt_info
663 && is_pattern_stmt_p (def_stmt_info))
665 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
666 != def_stmt_info)
667 oprnd_info->any_pattern = true;
668 else
 669 /* If we promote this to external, use the original stmt def. */
670 oprnd_info->ops.last ()
671 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
 674 /* If there's an extern def on a backedge make sure we can
675 code-generate at the region start.
676 ??? This is another case that could be fixed by adjusting
677 how we split the function but at the moment we'd have conflicting
678 goals there. */
679 if (backedge
680 && dts[i] == vect_external_def
681 && is_a <bb_vec_info> (vinfo)
682 && TREE_CODE (oprnd) == SSA_NAME
683 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
684 && !dominated_by_p (CDI_DOMINATORS,
685 as_a <bb_vec_info> (vinfo)->bbs[0],
686 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
688 if (dump_enabled_p ())
689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
690 "Build SLP failed: extern def %T only defined "
691 "on backedge\n", oprnd);
692 return -1;
695 if (first)
697 tree type = TREE_TYPE (oprnd);
698 dt = dts[i];
699 if ((dt == vect_constant_def
700 || dt == vect_external_def)
701 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
702 && (TREE_CODE (type) == BOOLEAN_TYPE
703 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
704 type)))
706 if (dump_enabled_p ())
707 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
708 "Build SLP failed: invalid type of def "
709 "for variable-length SLP %T\n", oprnd);
710 return -1;
713 /* For the swapping logic below force vect_reduction_def
714 for the reduction op in a SLP reduction group. */
715 if (!STMT_VINFO_DATA_REF (stmt_info)
716 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
717 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
718 && def_stmt_info)
719 dts[i] = dt = vect_reduction_def;
721 /* Check the types of the definition. */
722 switch (dt)
724 case vect_external_def:
725 case vect_constant_def:
726 case vect_internal_def:
727 case vect_reduction_def:
728 case vect_induction_def:
729 case vect_nested_cycle:
730 case vect_first_order_recurrence:
731 break;
733 default:
734 /* FORNOW: Not supported. */
735 if (dump_enabled_p ())
736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
737 "Build SLP failed: illegal type of def %T\n",
738 oprnd);
739 return -1;
742 oprnd_info->first_dt = dt;
743 oprnd_info->first_op_type = type;
746 if (first)
747 return 0;
749 /* Now match the operand definition types to that of the first stmt. */
750 for (i = 0; i < number_of_oprnds;)
752 if (skip_args[i])
754 ++i;
755 continue;
758 oprnd_info = (*oprnds_info)[i];
759 dt = dts[i];
760 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
761 oprnd = oprnd_info->ops[stmt_num];
762 tree type = TREE_TYPE (oprnd);
764 if (!types_compatible_p (oprnd_info->first_op_type, type))
766 if (dump_enabled_p ())
767 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
768 "Build SLP failed: different operand types\n");
769 return 1;
772 /* Not first stmt of the group, check that the def-stmt/s match
773 the def-stmt/s of the first stmt. Allow different definition
774 types for reduction chains: the first stmt must be a
775 vect_reduction_def (a phi node), and the rest
776 end in the reduction chain. */
777 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
778 && !(oprnd_info->first_dt == vect_reduction_def
779 && !STMT_VINFO_DATA_REF (stmt_info)
780 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
781 && def_stmt_info
782 && !STMT_VINFO_DATA_REF (def_stmt_info)
783 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
784 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
785 || (!STMT_VINFO_DATA_REF (stmt_info)
786 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
787 && ((!def_stmt_info
788 || STMT_VINFO_DATA_REF (def_stmt_info)
789 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
790 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
791 != (oprnd_info->first_dt != vect_reduction_def))))
793 /* Try swapping operands if we got a mismatch. For BB
794 vectorization only in case it will clearly improve things. */
795 if (i == commutative_op && !swapped
796 && (!is_a <bb_vec_info> (vinfo)
797 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
798 dts[i+1])
799 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
800 || vect_def_types_match
801 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
803 if (dump_enabled_p ())
804 dump_printf_loc (MSG_NOTE, vect_location,
805 "trying swapped operands\n");
806 std::swap (dts[i], dts[i+1]);
807 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
808 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
809 std::swap ((*oprnds_info)[i]->ops[stmt_num],
810 (*oprnds_info)[i+1]->ops[stmt_num]);
811 swapped = true;
812 continue;
815 if (is_a <bb_vec_info> (vinfo)
816 && !oprnd_info->any_pattern)
818 /* Now for commutative ops we should see whether we can
 819 make the other operand match. */
820 if (dump_enabled_p ())
821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
822 "treating operand as external\n");
823 oprnd_info->first_dt = dt = vect_external_def;
825 else
827 if (dump_enabled_p ())
828 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
829 "Build SLP failed: different types\n");
830 return 1;
834 /* Make sure to demote the overall operand to external. */
835 if (dt == vect_external_def)
836 oprnd_info->first_dt = vect_external_def;
837 /* For a SLP reduction chain we want to duplicate the reduction to
838 each of the chain members. That gets us a sane SLP graph (still
839 the stmts are not 100% correct wrt the initial values). */
840 else if ((dt == vect_internal_def
841 || dt == vect_reduction_def)
842 && oprnd_info->first_dt == vect_reduction_def
843 && !STMT_VINFO_DATA_REF (stmt_info)
844 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
845 && !STMT_VINFO_DATA_REF (def_stmt_info)
846 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
847 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
849 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
850 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
853 ++i;
856 /* Swap operands. */
857 if (swapped)
859 if (dump_enabled_p ())
860 dump_printf_loc (MSG_NOTE, vect_location,
861 "swapped operands to match def types in %G",
862 stmt_info->stmt);
865 return 0;
868 /* Return true if call statements CALL1 and CALL2 are similar enough
869 to be combined into the same SLP group. */
871 bool
872 compatible_calls_p (gcall *call1, gcall *call2)
874 unsigned int nargs = gimple_call_num_args (call1);
875 if (nargs != gimple_call_num_args (call2))
876 return false;
878 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
879 return false;
881 if (gimple_call_internal_p (call1))
883 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
884 TREE_TYPE (gimple_call_lhs (call2))))
885 return false;
886 for (unsigned int i = 0; i < nargs; ++i)
887 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
888 TREE_TYPE (gimple_call_arg (call2, i))))
889 return false;
891 else
893 if (!operand_equal_p (gimple_call_fn (call1),
894 gimple_call_fn (call2), 0))
895 return false;
897 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
898 return false;
901 /* Check that any unvectorized arguments are equal. */
902 if (const int *map = vect_get_operand_map (call1))
904 unsigned int nkept = *map++;
905 unsigned int mapi = 0;
906 for (unsigned int i = 0; i < nargs; ++i)
907 if (mapi < nkept && map[mapi] == int (i))
908 mapi += 1;
909 else if (!operand_equal_p (gimple_call_arg (call1, i),
910 gimple_call_arg (call2, i)))
911 return false;
914 return true;
917 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
918 caller's attempt to find the vector type in STMT_INFO with the narrowest
919 element type. Return true if VECTYPE is nonnull and if it is valid
920 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
921 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
922 vect_build_slp_tree. */
924 static bool
925 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
926 unsigned int group_size,
927 tree vectype, poly_uint64 *max_nunits)
929 if (!vectype)
931 if (dump_enabled_p ())
932 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
933 "Build SLP failed: unsupported data-type in %G\n",
934 stmt_info->stmt);
935 /* Fatal mismatch. */
936 return false;
939 /* If populating the vector type requires unrolling then fail
940 before adjusting *max_nunits for basic-block vectorization. */
941 if (is_a <bb_vec_info> (vinfo)
942 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
944 if (dump_enabled_p ())
945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
946 "Build SLP failed: unrolling required "
947 "in basic block SLP\n");
948 /* Fatal mismatch. */
949 return false;
952 /* In case of multiple types we need to detect the smallest type. */
953 vect_update_max_nunits (max_nunits, vectype);
954 return true;
 957 /* Verify that the scalar stmts STMTS are isomorphic, do not require data
 958 permutation and are not of unsupported types of operation. Return
 959 true if so, otherwise return false and indicate in *MATCHES
960 which stmts are not isomorphic to the first one. If MATCHES[0]
961 is false then this indicates the comparison could not be
962 carried out or the stmts will never be vectorized by SLP.
964 Note COND_EXPR is possibly isomorphic to another one after swapping its
965 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
966 the first stmt by swapping the two operands of comparison; set SWAP[i]
 967 to 2 if stmt I is isomorphic to the first stmt by inverting the code
 968 of comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
969 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
971 static bool
972 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
973 vec<stmt_vec_info> stmts, unsigned int group_size,
974 poly_uint64 *max_nunits, bool *matches,
975 bool *two_operators, tree *node_vectype)
977 unsigned int i;
978 stmt_vec_info first_stmt_info = stmts[0];
979 code_helper first_stmt_code = ERROR_MARK;
980 code_helper alt_stmt_code = ERROR_MARK;
981 code_helper rhs_code = ERROR_MARK;
982 code_helper first_cond_code = ERROR_MARK;
983 tree lhs;
984 bool need_same_oprnds = false;
985 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
986 stmt_vec_info first_load = NULL, prev_first_load = NULL;
987 bool first_stmt_ldst_p = false, ldst_p = false;
988 bool first_stmt_phi_p = false, phi_p = false;
989 bool maybe_soft_fail = false;
990 tree soft_fail_nunits_vectype = NULL_TREE;
992 /* For every stmt in NODE find its def stmt/s. */
993 stmt_vec_info stmt_info;
994 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
996 gimple *stmt = stmt_info->stmt;
997 swap[i] = 0;
998 matches[i] = false;
1000 if (dump_enabled_p ())
1001 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
 1003 /* Fail to vectorize statements marked as unvectorizable, that can
 1004 throw, or that have volatile operands. */
1005 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1006 || stmt_can_throw_internal (cfun, stmt)
1007 || gimple_has_volatile_ops (stmt))
1009 if (dump_enabled_p ())
1010 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1011 "Build SLP failed: unvectorizable statement %G",
1012 stmt);
 1013 /* ??? For BB vectorization we want to commute operands in a way
 1014 that shuffles all unvectorizable defs into one operand and keeps
 1015 the other still vectorizable. The following doesn't reliably
 1016 work for this, but it's the easiest we can do here. */
1017 if (is_a <bb_vec_info> (vinfo) && i != 0)
1018 continue;
1019 /* Fatal mismatch. */
1020 matches[0] = false;
1021 return false;
1024 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1025 lhs = gimple_get_lhs (stmt);
1026 if (lhs == NULL_TREE
1027 && (!call_stmt
1028 || !gimple_call_internal_p (stmt)
1029 || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1031 if (dump_enabled_p ())
1032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1033 "Build SLP failed: not GIMPLE_ASSIGN nor "
1034 "GIMPLE_CALL %G", stmt);
1035 if (is_a <bb_vec_info> (vinfo) && i != 0)
1036 continue;
1037 /* Fatal mismatch. */
1038 matches[0] = false;
1039 return false;
1042 tree nunits_vectype;
1043 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1044 &nunits_vectype, group_size))
1046 if (is_a <bb_vec_info> (vinfo) && i != 0)
1047 continue;
1048 /* Fatal mismatch. */
1049 matches[0] = false;
1050 return false;
1052 /* Record nunits required but continue analysis, producing matches[]
1053 as if nunits was not an issue. This allows splitting of groups
1054 to happen. */
1055 if (nunits_vectype
1056 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1057 nunits_vectype, max_nunits))
1059 gcc_assert (is_a <bb_vec_info> (vinfo));
1060 maybe_soft_fail = true;
1061 soft_fail_nunits_vectype = nunits_vectype;
1064 gcc_assert (vectype);
1066 if (call_stmt)
1068 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1069 if (cfn != CFN_LAST)
1070 rhs_code = cfn;
1071 else
1072 rhs_code = CALL_EXPR;
1074 if (cfn == CFN_MASK_LOAD
1075 || cfn == CFN_GATHER_LOAD
1076 || cfn == CFN_MASK_GATHER_LOAD)
1077 ldst_p = true;
1078 else if (cfn == CFN_MASK_STORE)
1080 ldst_p = true;
1081 rhs_code = CFN_MASK_STORE;
1083 else if ((internal_fn_p (cfn)
1084 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1085 || gimple_call_tail_p (call_stmt)
1086 || gimple_call_noreturn_p (call_stmt)
1087 || gimple_call_chain (call_stmt))
1089 if (dump_enabled_p ())
1090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1091 "Build SLP failed: unsupported call type %G",
1092 (gimple *) call_stmt);
1093 if (is_a <bb_vec_info> (vinfo) && i != 0)
1094 continue;
1095 /* Fatal mismatch. */
1096 matches[0] = false;
1097 return false;
1100 else if (gimple_code (stmt) == GIMPLE_PHI)
1102 rhs_code = ERROR_MARK;
1103 phi_p = true;
1105 else
1107 rhs_code = gimple_assign_rhs_code (stmt);
1108 ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1111 /* Check the operation. */
1112 if (i == 0)
1114 *node_vectype = vectype;
1115 first_stmt_code = rhs_code;
1116 first_stmt_ldst_p = ldst_p;
1117 first_stmt_phi_p = phi_p;
1119 /* Shift arguments should be equal in all the packed stmts for a
1120 vector shift with scalar shift operand. */
1121 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1122 || rhs_code == LROTATE_EXPR
1123 || rhs_code == RROTATE_EXPR)
1125 /* First see if we have a vector/vector shift. */
1126 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1128 /* No vector/vector shift, try for a vector/scalar shift. */
1129 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1131 if (dump_enabled_p ())
1132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1133 "Build SLP failed: "
1134 "op not supported by target.\n");
1135 if (is_a <bb_vec_info> (vinfo) && i != 0)
1136 continue;
1137 /* Fatal mismatch. */
1138 matches[0] = false;
1139 return false;
1141 need_same_oprnds = true;
1142 first_op1 = gimple_assign_rhs2 (stmt);
1145 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1147 need_same_oprnds = true;
1148 first_op1 = gimple_assign_rhs2 (stmt);
1150 else if (!ldst_p
1151 && rhs_code == BIT_FIELD_REF)
1153 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1154 if (!is_a <bb_vec_info> (vinfo)
1155 || TREE_CODE (vec) != SSA_NAME
1156 /* When the element types are not compatible we pun the
1157 source to the target vectype which requires equal size. */
1158 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1159 || !types_compatible_p (TREE_TYPE (vectype),
1160 TREE_TYPE (TREE_TYPE (vec))))
1161 && !operand_equal_p (TYPE_SIZE (vectype),
1162 TYPE_SIZE (TREE_TYPE (vec)))))
1164 if (dump_enabled_p ())
1165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1166 "Build SLP failed: "
1167 "BIT_FIELD_REF not supported\n");
1168 /* Fatal mismatch. */
1169 matches[0] = false;
1170 return false;
1173 else if (rhs_code == CFN_DIV_POW2)
1175 need_same_oprnds = true;
1176 first_op1 = gimple_call_arg (call_stmt, 1);
1179 else
1181 if (first_stmt_code != rhs_code
1182 && alt_stmt_code == ERROR_MARK)
1183 alt_stmt_code = rhs_code;
1184 if ((first_stmt_code != rhs_code
1185 && (first_stmt_code != IMAGPART_EXPR
1186 || rhs_code != REALPART_EXPR)
1187 && (first_stmt_code != REALPART_EXPR
1188 || rhs_code != IMAGPART_EXPR)
1189 /* Handle mismatches in plus/minus by computing both
1190 and merging the results. */
1191 && !((first_stmt_code == PLUS_EXPR
1192 || first_stmt_code == MINUS_EXPR)
1193 && (alt_stmt_code == PLUS_EXPR
1194 || alt_stmt_code == MINUS_EXPR)
1195 && rhs_code == alt_stmt_code)
1196 && !(first_stmt_code.is_tree_code ()
1197 && rhs_code.is_tree_code ()
1198 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1199 == tcc_comparison)
1200 && (swap_tree_comparison (tree_code (first_stmt_code))
1201 == tree_code (rhs_code)))
1202 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1203 && (first_stmt_code == ARRAY_REF
1204 || first_stmt_code == BIT_FIELD_REF
1205 || first_stmt_code == INDIRECT_REF
1206 || first_stmt_code == COMPONENT_REF
1207 || first_stmt_code == MEM_REF)
1208 && (rhs_code == ARRAY_REF
1209 || rhs_code == BIT_FIELD_REF
1210 || rhs_code == INDIRECT_REF
1211 || rhs_code == COMPONENT_REF
1212 || rhs_code == MEM_REF)))
1213 || first_stmt_ldst_p != ldst_p
1214 || first_stmt_phi_p != phi_p)
1216 if (dump_enabled_p ())
1218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1219 "Build SLP failed: different operation "
1220 "in stmt %G", stmt);
1221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1222 "original stmt %G", first_stmt_info->stmt);
1224 /* Mismatch. */
1225 continue;
1228 if (!ldst_p
1229 && first_stmt_code == BIT_FIELD_REF
1230 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1231 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "Build SLP failed: different BIT_FIELD_REF "
1236 "arguments in %G", stmt);
1237 /* Mismatch. */
1238 continue;
1241 if (call_stmt
1242 && first_stmt_code != CFN_MASK_LOAD
1243 && first_stmt_code != CFN_MASK_STORE)
1245 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1246 call_stmt))
1248 if (dump_enabled_p ())
1249 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1250 "Build SLP failed: different calls in %G",
1251 stmt);
1252 /* Mismatch. */
1253 continue;
1257 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1258 && (gimple_bb (first_stmt_info->stmt)
1259 != gimple_bb (stmt_info->stmt)))
1261 if (dump_enabled_p ())
1262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1263 "Build SLP failed: different BB for PHI "
1264 "or possibly trapping operation in %G", stmt);
1265 /* Mismatch. */
1266 continue;
1269 if (need_same_oprnds)
1271 tree other_op1 = gimple_arg (stmt, 1);
1272 if (!operand_equal_p (first_op1, other_op1, 0))
1274 if (dump_enabled_p ())
1275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1276 "Build SLP failed: different shift "
1277 "arguments in %G", stmt);
1278 /* Mismatch. */
1279 continue;
1283 if (!types_compatible_p (vectype, *node_vectype))
1285 if (dump_enabled_p ())
1286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1287 "Build SLP failed: different vector type "
1288 "in %G", stmt);
1289 /* Mismatch. */
1290 continue;
1294 /* Grouped store or load. */
1295 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1297 gcc_assert (ldst_p);
1298 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1300 /* Store. */
1301 gcc_assert (rhs_code == CFN_MASK_STORE
1302 || REFERENCE_CLASS_P (lhs)
1303 || DECL_P (lhs));
1305 else
1307 /* Load. */
1308 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1309 if (prev_first_load)
1311 /* Check that there are no loads from different interleaving
1312 chains in the same node. */
1313 if (prev_first_load != first_load)
1315 if (dump_enabled_p ())
1316 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1317 vect_location,
1318 "Build SLP failed: different "
1319 "interleaving chains in one node %G",
1320 stmt);
1321 /* Mismatch. */
1322 continue;
1325 else
1326 prev_first_load = first_load;
1329 /* Non-grouped store or load. */
1330 else if (ldst_p)
1332 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1333 && rhs_code != CFN_GATHER_LOAD
1334 && rhs_code != CFN_MASK_GATHER_LOAD
1335 /* Not grouped loads are handled as externals for BB
1336 vectorization. For loop vectorization we can handle
 1337 splats the same way we handle single element interleaving. */
1338 && (is_a <bb_vec_info> (vinfo)
1339 || stmt_info != first_stmt_info
1340 || STMT_VINFO_GATHER_SCATTER_P (stmt_info)))
1342 /* Not grouped load. */
1343 if (dump_enabled_p ())
1344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1345 "Build SLP failed: not grouped load %G", stmt);
1347 if (i != 0)
1348 continue;
1349 /* Fatal mismatch. */
1350 matches[0] = false;
1351 return false;
1354 /* Not memory operation. */
1355 else
1357 if (!phi_p
1358 && rhs_code.is_tree_code ()
1359 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1360 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1361 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1362 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1363 && rhs_code != VIEW_CONVERT_EXPR
1364 && rhs_code != CALL_EXPR
1365 && rhs_code != BIT_FIELD_REF)
1367 if (dump_enabled_p ())
1368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1369 "Build SLP failed: operation unsupported %G",
1370 stmt);
1371 if (is_a <bb_vec_info> (vinfo) && i != 0)
1372 continue;
1373 /* Fatal mismatch. */
1374 matches[0] = false;
1375 return false;
1378 if (rhs_code == COND_EXPR)
1380 tree cond_expr = gimple_assign_rhs1 (stmt);
1381 enum tree_code cond_code = TREE_CODE (cond_expr);
1382 enum tree_code swap_code = ERROR_MARK;
1383 enum tree_code invert_code = ERROR_MARK;
1385 if (i == 0)
1386 first_cond_code = TREE_CODE (cond_expr);
1387 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1389 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1390 swap_code = swap_tree_comparison (cond_code);
1391 invert_code = invert_tree_comparison (cond_code, honor_nans);
1394 if (first_cond_code == cond_code)
1396 /* Isomorphic can be achieved by swapping. */
1397 else if (first_cond_code == swap_code)
1398 swap[i] = 1;
1399 /* Isomorphic can be achieved by inverting. */
1400 else if (first_cond_code == invert_code)
1401 swap[i] = 2;
1402 else
1404 if (dump_enabled_p ())
1405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1406 "Build SLP failed: different"
1407 " operation %G", stmt);
1408 /* Mismatch. */
1409 continue;
1413 if (rhs_code.is_tree_code ()
1414 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1415 && (swap_tree_comparison ((tree_code)first_stmt_code)
1416 == (tree_code)rhs_code))
1417 swap[i] = 1;
1420 matches[i] = true;
1423 for (i = 0; i < group_size; ++i)
1424 if (!matches[i])
1425 return false;
1427 /* If we allowed a two-operation SLP node verify the target can cope
1428 with the permute we are going to use. */
1429 if (alt_stmt_code != ERROR_MARK
1430 && (!alt_stmt_code.is_tree_code ()
1431 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1432 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1434 *two_operators = true;
1437 if (maybe_soft_fail)
1439 unsigned HOST_WIDE_INT const_nunits;
1440 if (!TYPE_VECTOR_SUBPARTS
1441 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1442 || const_nunits > group_size)
1443 matches[0] = false;
1444 else
1446 /* With constant vector elements simulate a mismatch at the
1447 point we need to split. */
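/* E.g. for a group of 6 stmts and vectors of 4 elements the last two
   lanes are marked as mismatching so the group gets split after the
   first full vector.  */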
1448 unsigned tail = group_size & (const_nunits - 1);
1449 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1451 return false;
1454 return true;
 1457 /* Traits for the hash_map used to record failed SLP builds for a stmt set.
1458 Note we never remove apart from at destruction time so we do not
1459 need a special value for deleted that differs from empty. */
1460 struct bst_traits
1462 typedef vec <stmt_vec_info> value_type;
1463 typedef vec <stmt_vec_info> compare_type;
1464 static inline hashval_t hash (value_type);
1465 static inline bool equal (value_type existing, value_type candidate);
1466 static inline bool is_empty (value_type x) { return !x.exists (); }
1467 static inline bool is_deleted (value_type x) { return !x.exists (); }
1468 static const bool empty_zero_p = true;
1469 static inline void mark_empty (value_type &x) { x.release (); }
1470 static inline void mark_deleted (value_type &x) { x.release (); }
1471 static inline void remove (value_type &x) { x.release (); }
1473 inline hashval_t
1474 bst_traits::hash (value_type x)
1476 inchash::hash h;
1477 for (unsigned i = 0; i < x.length (); ++i)
1478 h.add_int (gimple_uid (x[i]->stmt));
1479 return h.end ();
1481 inline bool
1482 bst_traits::equal (value_type existing, value_type candidate)
1484 if (existing.length () != candidate.length ())
1485 return false;
1486 for (unsigned i = 0; i < existing.length (); ++i)
1487 if (existing[i] != candidate[i])
1488 return false;
1489 return true;
1492 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1493 but then vec::insert does memmove and that's not compatible with
1494 std::pair. */
1495 struct chain_op_t
1497 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1498 : code (code_), dt (dt_), op (op_) {}
1499 tree_code code;
1500 vect_def_type dt;
1501 tree op;
1504 /* Comparator for sorting associatable chains. */
1506 static int
1507 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1509 auto *op1 = (const chain_op_t *) op1_;
1510 auto *op2 = (const chain_op_t *) op2_;
1511 if (op1->dt != op2->dt)
1512 return (int)op1->dt - (int)op2->dt;
1513 return (int)op1->code - (int)op2->code;
1516 /* Linearize the associatable expression chain at START with the
1517 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1518 filling CHAIN with the result and using WORKLIST as intermediate storage.
1519 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1520 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1521 stmts, starting with START. */
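/* For example the lane computation x = ((a + b) - c) + d is linearized
   into the chain { (+, d), (-, c), (+, a), (+, b) }, i.e. the multiset
   of signed leaf operands independent of the original association.  */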
1523 static void
1524 vect_slp_linearize_chain (vec_info *vinfo,
1525 vec<std::pair<tree_code, gimple *> > &worklist,
1526 vec<chain_op_t> &chain,
1527 enum tree_code code, gimple *start,
1528 gimple *&code_stmt, gimple *&alt_code_stmt,
1529 vec<gimple *> *chain_stmts)
1531 /* For each lane linearize the addition/subtraction (or other
1532 uniform associatable operation) expression tree. */
1533 worklist.safe_push (std::make_pair (code, start));
1534 while (!worklist.is_empty ())
1536 auto entry = worklist.pop ();
1537 gassign *stmt = as_a <gassign *> (entry.second);
1538 enum tree_code in_code = entry.first;
1539 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1540 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1541 if (!code_stmt
1542 && gimple_assign_rhs_code (stmt) == code)
1543 code_stmt = stmt;
1544 else if (!alt_code_stmt
1545 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1546 alt_code_stmt = stmt;
1547 if (chain_stmts)
1548 chain_stmts->safe_push (stmt);
1549 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1551 tree op = gimple_op (stmt, opnum);
1552 vect_def_type dt;
1553 stmt_vec_info def_stmt_info;
1554 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1555 gcc_assert (res);
1556 if (dt == vect_internal_def
1557 && is_pattern_stmt_p (def_stmt_info))
1558 op = gimple_get_lhs (def_stmt_info->stmt);
1559 gimple *use_stmt;
1560 use_operand_p use_p;
1561 if (dt == vect_internal_def
1562 && single_imm_use (op, &use_p, &use_stmt)
1563 && is_gimple_assign (def_stmt_info->stmt)
1564 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1565 || (code == PLUS_EXPR
1566 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1567 == MINUS_EXPR))))
1569 tree_code op_def_code = this_code;
1570 if (op_def_code == MINUS_EXPR && opnum == 1)
1571 op_def_code = PLUS_EXPR;
1572 if (in_code == MINUS_EXPR)
1573 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1574 worklist.safe_push (std::make_pair (op_def_code,
1575 def_stmt_info->stmt));
1577 else
1579 tree_code op_def_code = this_code;
1580 if (op_def_code == MINUS_EXPR && opnum == 1)
1581 op_def_code = PLUS_EXPR;
1582 if (in_code == MINUS_EXPR)
1583 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1584 chain.safe_push (chain_op_t (op_def_code, dt, op));
1590 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1591 simple_hashmap_traits <bst_traits, slp_tree> >
1592 scalar_stmts_to_slp_tree_map_t;
1594 static slp_tree
1595 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1596 vec<stmt_vec_info> stmts, unsigned int group_size,
1597 poly_uint64 *max_nunits,
1598 bool *matches, unsigned *limit, unsigned *tree_size,
1599 scalar_stmts_to_slp_tree_map_t *bst_map);
1601 static slp_tree
1602 vect_build_slp_tree (vec_info *vinfo,
1603 vec<stmt_vec_info> stmts, unsigned int group_size,
1604 poly_uint64 *max_nunits,
1605 bool *matches, unsigned *limit, unsigned *tree_size,
1606 scalar_stmts_to_slp_tree_map_t *bst_map)
1608 if (slp_tree *leader = bst_map->get (stmts))
1610 if (dump_enabled_p ())
1611 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1612 !(*leader)->failed ? "" : "failed ",
1613 (void *) *leader);
1614 if (!(*leader)->failed)
1616 SLP_TREE_REF_COUNT (*leader)++;
1617 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1618 stmts.release ();
1619 return *leader;
1621 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1622 return NULL;
1625 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1626 so we can pick up backedge destinations during discovery. */
1627 slp_tree res = new _slp_tree;
1628 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1629 SLP_TREE_SCALAR_STMTS (res) = stmts;
1630 bst_map->put (stmts.copy (), res);
1632 if (*limit == 0)
1634 if (dump_enabled_p ())
1635 dump_printf_loc (MSG_NOTE, vect_location,
1636 "SLP discovery limit exceeded\n");
1637 /* Mark the node invalid so we can detect those when still in use
1638 as backedge destinations. */
1639 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1640 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1641 res->failed = XNEWVEC (bool, group_size);
1642 memset (res->failed, 0, sizeof (bool) * group_size);
1643 memset (matches, 0, sizeof (bool) * group_size);
1644 return NULL;
1646 --*limit;
1648 if (dump_enabled_p ())
1649 dump_printf_loc (MSG_NOTE, vect_location,
1650 "starting SLP discovery for node %p\n", (void *) res);
1652 poly_uint64 this_max_nunits = 1;
1653 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1654 &this_max_nunits,
1655 matches, limit, tree_size, bst_map);
1656 if (!res_)
1658 if (dump_enabled_p ())
1659 dump_printf_loc (MSG_NOTE, vect_location,
1660 "SLP discovery for node %p failed\n", (void *) res);
1661 /* Mark the node invalid so we can detect those when still in use
1662 as backedge destinations. */
1663 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1664 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1665 res->failed = XNEWVEC (bool, group_size);
1666 if (flag_checking)
1668 unsigned i;
1669 for (i = 0; i < group_size; ++i)
1670 if (!matches[i])
1671 break;
1672 gcc_assert (i < group_size);
1674 memcpy (res->failed, matches, sizeof (bool) * group_size);
1676 else
1678 if (dump_enabled_p ())
1679 dump_printf_loc (MSG_NOTE, vect_location,
1680 "SLP discovery for node %p succeeded\n",
1681 (void *) res);
1682 gcc_assert (res_ == res);
1683 res->max_nunits = this_max_nunits;
1684 vect_update_max_nunits (max_nunits, this_max_nunits);
1685 /* Keep a reference for the bst_map use. */
1686 SLP_TREE_REF_COUNT (res)++;
1688 return res_;
1691 /* Helper for building an associated SLP node chain. */
1693 static void
1694 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1695 slp_tree op0, slp_tree op1,
1696 stmt_vec_info oper1, stmt_vec_info oper2,
1697 vec<std::pair<unsigned, unsigned> > lperm)
1699 unsigned group_size = SLP_TREE_LANES (op1);
1701 slp_tree child1 = new _slp_tree;
1702 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1703 SLP_TREE_VECTYPE (child1) = vectype;
1704 SLP_TREE_LANES (child1) = group_size;
1705 SLP_TREE_CHILDREN (child1).create (2);
1706 SLP_TREE_CHILDREN (child1).quick_push (op0);
1707 SLP_TREE_CHILDREN (child1).quick_push (op1);
1708 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1710 slp_tree child2 = new _slp_tree;
1711 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1712 SLP_TREE_VECTYPE (child2) = vectype;
1713 SLP_TREE_LANES (child2) = group_size;
1714 SLP_TREE_CHILDREN (child2).create (2);
1715 SLP_TREE_CHILDREN (child2).quick_push (op0);
1716 SLP_TREE_REF_COUNT (op0)++;
1717 SLP_TREE_CHILDREN (child2).quick_push (op1);
1718 SLP_TREE_REF_COUNT (op1)++;
1719 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1721 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1722 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1723 SLP_TREE_VECTYPE (perm) = vectype;
1724 SLP_TREE_LANES (perm) = group_size;
1725 /* ??? We should set this NULL but that's not expected. */
1726 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1727 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1728 SLP_TREE_CHILDREN (perm).quick_push (child1);
1729 SLP_TREE_CHILDREN (perm).quick_push (child2);
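/* The PERM node built above selects, per lane, either the result of
   CHILD1 (OPER1 applied to all lanes) or of CHILD2 (OPER2 applied to all
   lanes) according to LPERM; e.g. an alternating minus/plus group becomes
   a MINUS node, a PLUS node and a blending permute.  */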
1732 /* Recursively build an SLP tree starting from NODE.
 1733 Fail (and return NULL) if def-stmts are not
 1734 isomorphic, require data permutation or are of unsupported types of
 1735 operation. Otherwise, return the built SLP node, with MATCHES
 1736 indicating which lanes matched the first one. */
1739 static slp_tree
1740 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1741 vec<stmt_vec_info> stmts, unsigned int group_size,
1742 poly_uint64 *max_nunits,
1743 bool *matches, unsigned *limit, unsigned *tree_size,
1744 scalar_stmts_to_slp_tree_map_t *bst_map)
1746 unsigned nops, i, this_tree_size = 0;
1747 poly_uint64 this_max_nunits = *max_nunits;
1749 matches[0] = false;
1751 stmt_vec_info stmt_info = stmts[0];
1752 if (!is_a<gcall *> (stmt_info->stmt)
1753 && !is_a<gassign *> (stmt_info->stmt)
1754 && !is_a<gphi *> (stmt_info->stmt))
1755 return NULL;
1757 nops = gimple_num_args (stmt_info->stmt);
1758 if (const int *map = vect_get_operand_map (stmt_info->stmt))
1759 nops = map[0];
1761 /* If the SLP node is a PHI (induction or reduction), terminate
1762 the recursion. */
1763 bool *skip_args = XALLOCAVEC (bool, nops);
1764 memset (skip_args, 0, sizeof (bool) * nops);
1765 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1766 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1768 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1769 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1770 group_size);
1771 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1772 max_nunits))
1773 return NULL;
1775 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1776 if (def_type == vect_induction_def)
1778 /* Induction PHIs are not cycles but walk the initial
 1779 value. Only for inner loops though; for outer loops
1780 we need to pick up the value from the actual PHIs
1781 to more easily support peeling and epilogue vectorization. */
1782 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1783 if (!nested_in_vect_loop_p (loop, stmt_info))
1784 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1785 else
1786 loop = loop->inner;
1787 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1789 else if (def_type == vect_reduction_def
1790 || def_type == vect_double_reduction_def
1791 || def_type == vect_nested_cycle
1792 || def_type == vect_first_order_recurrence)
1794 /* Else def types have to match. */
1795 stmt_vec_info other_info;
1796 bool all_same = true;
1797 FOR_EACH_VEC_ELT (stmts, i, other_info)
1799 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1800 return NULL;
1801 if (other_info != stmt_info)
1802 all_same = false;
1804 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 1805 /* Reduction initial values are not explicitly represented. */
1806 if (def_type != vect_first_order_recurrence
1807 && !nested_in_vect_loop_p (loop, stmt_info))
1808 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1809 /* Reduction chain backedge defs are filled manually.
1810 ??? Need a better way to identify a SLP reduction chain PHI.
1811 Or a better overall way to SLP match those. */
1812 if (all_same && def_type == vect_reduction_def)
1813 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1815 else if (def_type != vect_internal_def)
1816 return NULL;
1820 bool two_operators = false;
1821 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1822 tree vectype = NULL_TREE;
1823 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1824 &this_max_nunits, matches, &two_operators,
1825 &vectype))
1826 return NULL;
1828 /* If the SLP node is a load, terminate the recursion unless masked. */
1829 if (STMT_VINFO_DATA_REF (stmt_info)
1830 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1832 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1833 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1834 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1835 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1836 else
1838 *max_nunits = this_max_nunits;
1839 (*tree_size)++;
1840 node = vect_create_new_slp_node (node, stmts, 0);
1841 SLP_TREE_VECTYPE (node) = vectype;
1842 /* And compute the load permutation. Whether it is actually
1843 a permutation depends on the unrolling factor which is
1844 decided later. */
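/* E.g. a node whose lanes load a[1], a[0], a[3], a[2] from a group
   starting at a[0] gets the load permutation { 1, 0, 3, 2 }.  */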
1845 vec<unsigned> load_permutation;
1846 int j;
1847 stmt_vec_info load_info;
1848 load_permutation.create (group_size);
1849 stmt_vec_info first_stmt_info
1850 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1851 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1853 int load_place;
1854 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1855 load_place = vect_get_place_in_interleaving_chain
1856 (load_info, first_stmt_info);
1857 else
1858 load_place = 0;
1859 gcc_assert (load_place != -1);
1860 load_permutation.safe_push (load_place);
1862 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1863 return node;
1866 else if (gimple_assign_single_p (stmt_info->stmt)
1867 && !gimple_vuse (stmt_info->stmt)
1868 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1870 /* vect_build_slp_tree_2 determined that all BIT_FIELD_REFs reference
1871 the same SSA name vector, with a type compatible with vectype. */
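/* A purely illustrative example, not derived from the code here: with a
   V4SF source vector _1 holding 32-bit float elements, the extract
     x = BIT_FIELD_REF <_1, 32, 64>;
   has bit_field_size 32 and bit_field_offset 64 and therefore selects
   lane 64 / 32 == 2.  */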
1872 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1873 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1874 stmt_vec_info estmt_info;
1875 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1877 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1878 tree bfref = gimple_assign_rhs1 (estmt);
1879 HOST_WIDE_INT lane;
1880 if (!known_eq (bit_field_size (bfref),
1881 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1882 || !constant_multiple_p (bit_field_offset (bfref),
1883 bit_field_size (bfref), &lane))
1885 lperm.release ();
1886 matches[0] = false;
1887 return NULL;
1889 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1891 slp_tree vnode = vect_create_new_slp_node (vNULL);
1892 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1893 /* ??? We record vectype here but we hide eventually necessary
1894 punning and instead rely on code generation to materialize
1895 VIEW_CONVERT_EXPRs as necessary. We instead should make
1896 this explicit somehow. */
1897 SLP_TREE_VECTYPE (vnode) = vectype;
1898 else
1900 /* For different size but compatible elements we can still
1901 use VEC_PERM_EXPR without punning. */
1902 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
1903 && types_compatible_p (TREE_TYPE (vectype),
1904 TREE_TYPE (TREE_TYPE (vec))));
1905 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1907 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
1908 unsigned HOST_WIDE_INT const_nunits;
1909 if (nunits.is_constant (&const_nunits))
1910 SLP_TREE_LANES (vnode) = const_nunits;
1911 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1912 /* We are always building a permutation node even if it is an identity
1913 permute to shield the rest of the vectorizer from the odd node
1914 representing an actual vector without any scalar ops.
1915 ??? We could hide it completely by making the permute node
1916 external? */
1917 node = vect_create_new_slp_node (node, stmts, 1);
1918 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1919 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1920 SLP_TREE_VECTYPE (node) = vectype;
1921 SLP_TREE_CHILDREN (node).quick_push (vnode);
1922 return node;
1924 /* When discovery reaches an associatable operation see whether we can
1925 improve that to match up lanes in a way superior to the operand
1926 swapping code which at most looks at two defs.
1927 ??? For BB vectorization we cannot do the brute-force search
1928 for matching as we can succeed by means of builds from scalars
1929 and have no good way to "cost" one build against another. */
1930 else if (is_a <loop_vec_info> (vinfo)
1931 /* ??? We don't handle !vect_internal_def defs below. */
1932 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1933 && is_gimple_assign (stmt_info->stmt)
1934 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1935 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1936 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1937 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1938 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1940 /* See if we have a chain of (mixed) adds or subtracts or other
1941 associatable ops. */
1942 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1943 if (code == MINUS_EXPR)
1944 code = PLUS_EXPR;
1945 stmt_vec_info other_op_stmt_info = NULL;
1946 stmt_vec_info op_stmt_info = NULL;
1947 unsigned chain_len = 0;
1948 auto_vec<chain_op_t> chain;
1949 auto_vec<std::pair<tree_code, gimple *> > worklist;
1950 auto_vec<vec<chain_op_t> > chains (group_size);
1951 auto_vec<slp_tree, 4> children;
1952 bool hard_fail = true;
1953 for (unsigned lane = 0; lane < group_size; ++lane)
1955 /* For each lane linearize the addition/subtraction (or other
1956 uniform associatable operation) expression tree. */
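/* A purely illustrative example: for a lane computing
     _1 = a - b;  x = _1 + c;
   the linearized chain is { +a, -b, +c } with CODE taken as PLUS_EXPR,
   i.e. each chain element records whether its operand is added or
   subtracted.  */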
1957 gimple *op_stmt = NULL, *other_op_stmt = NULL;
1958 vect_slp_linearize_chain (vinfo, worklist, chain, code,
1959 stmts[lane]->stmt, op_stmt, other_op_stmt,
1960 NULL);
1961 if (!op_stmt_info && op_stmt)
1962 op_stmt_info = vinfo->lookup_stmt (op_stmt);
1963 if (!other_op_stmt_info && other_op_stmt)
1964 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1965 if (chain.length () == 2)
1967 /* In a chain of just two elements resort to the regular
1968 operand swapping scheme. If we run into a length
1969 mismatch still hard-FAIL. */
1970 if (chain_len == 0)
1971 hard_fail = false;
1972 else
1974 matches[lane] = false;
1975 /* ??? We might want to process the other lanes, but
1976 make sure to not give false matching hints to the
1977 caller for lanes we did not process. */
1978 if (lane != group_size - 1)
1979 matches[0] = false;
1981 break;
1983 else if (chain_len == 0)
1984 chain_len = chain.length ();
1985 else if (chain.length () != chain_len)
1987 /* ??? Here we could slip in magic to compensate with
1988 neutral operands. */
1989 matches[lane] = false;
1990 if (lane != group_size - 1)
1991 matches[0] = false;
1992 break;
1994 chains.quick_push (chain.copy ());
1995 chain.truncate (0);
1997 if (chains.length () == group_size)
1999 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2000 if (!op_stmt_info)
2002 hard_fail = false;
2003 goto out;
2005 /* Now we have a set of chains with the same length. */
2006 /* 1. pre-sort according to def_type and operation. */
2007 for (unsigned lane = 0; lane < group_size; ++lane)
2008 chains[lane].stablesort (dt_sort_cmp, vinfo);
2009 if (dump_enabled_p ())
2011 dump_printf_loc (MSG_NOTE, vect_location,
2012 "pre-sorted chains of %s\n",
2013 get_tree_code_name (code));
2014 for (unsigned lane = 0; lane < group_size; ++lane)
2016 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2017 dump_printf (MSG_NOTE, "%s %T ",
2018 get_tree_code_name (chains[lane][opnum].code),
2019 chains[lane][opnum].op);
2020 dump_printf (MSG_NOTE, "\n");
2023 /* 2. try to build children nodes, associating as necessary. */
2024 for (unsigned n = 0; n < chain_len; ++n)
2026 vect_def_type dt = chains[0][n].dt;
2027 unsigned lane;
2028 for (lane = 0; lane < group_size; ++lane)
2029 if (chains[lane][n].dt != dt)
2031 if (dt == vect_constant_def
2032 && chains[lane][n].dt == vect_external_def)
2033 dt = vect_external_def;
2034 else if (dt == vect_external_def
2035 && chains[lane][n].dt == vect_constant_def)
2037 else
2038 break;
2040 if (lane != group_size)
2042 if (dump_enabled_p ())
2043 dump_printf_loc (MSG_NOTE, vect_location,
2044 "giving up on chain due to mismatched "
2045 "def types\n");
2046 matches[lane] = false;
2047 if (lane != group_size - 1)
2048 matches[0] = false;
2049 goto out;
2051 if (dt == vect_constant_def
2052 || dt == vect_external_def)
2054 /* Check whether we can build the invariant. If we can't
2055 we never will be able to. */
2056 tree type = TREE_TYPE (chains[0][n].op);
2057 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2058 && (TREE_CODE (type) == BOOLEAN_TYPE
2059 || !can_duplicate_and_interleave_p (vinfo, group_size,
2060 type)))
2062 matches[0] = false;
2063 goto out;
2065 vec<tree> ops;
2066 ops.create (group_size);
2067 for (lane = 0; lane < group_size; ++lane)
2068 ops.quick_push (chains[lane][n].op);
2069 slp_tree child = vect_create_new_slp_node (ops);
2070 SLP_TREE_DEF_TYPE (child) = dt;
2071 children.safe_push (child);
2073 else if (dt != vect_internal_def)
2075 /* Not sure, we might need something special.
2076 gcc.dg/vect/pr96854.c,
2077 gfortran.dg/vect/fast-math-pr37021.f90
2078 and gfortran.dg/vect/pr61171.f trigger. */
2079 /* Soft-fail for now. */
2080 hard_fail = false;
2081 goto out;
2083 else
2085 vec<stmt_vec_info> op_stmts;
2086 op_stmts.create (group_size);
2087 slp_tree child = NULL;
2088 /* Brute-force our way. We have to consider a lane
2089 failing after fixing an earlier fail up in the
2090 SLP discovery recursion. So track the current
2091 permute per lane. */
2092 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2093 memset (perms, 0, sizeof (unsigned) * group_size);
2096 op_stmts.truncate (0);
2097 for (lane = 0; lane < group_size; ++lane)
2098 op_stmts.quick_push
2099 (vinfo->lookup_def (chains[lane][n].op));
2100 child = vect_build_slp_tree (vinfo, op_stmts,
2101 group_size, &this_max_nunits,
2102 matches, limit,
2103 &this_tree_size, bst_map);
2104 /* ??? We're likely getting too many fatal mismatches
2105 here so maybe we want to ignore them (but then we
2106 have no idea which lanes fatally mismatched). */
2107 if (child || !matches[0])
2108 break;
2109 /* Swap another lane we have not yet matched up into
2110 lanes that did not match. If we run out of
2111 permute possibilities for a lane terminate the
2112 search. */
2113 bool term = false;
2114 for (lane = 1; lane < group_size; ++lane)
2115 if (!matches[lane])
2117 if (n + perms[lane] + 1 == chain_len)
2119 term = true;
2120 break;
2122 std::swap (chains[lane][n],
2123 chains[lane][n + perms[lane] + 1]);
2124 perms[lane]++;
2126 if (term)
2127 break;
2129 while (1);
2130 if (!child)
2132 if (dump_enabled_p ())
2133 dump_printf_loc (MSG_NOTE, vect_location,
2134 "failed to match up op %d\n", n);
2135 op_stmts.release ();
2136 if (lane != group_size - 1)
2137 matches[0] = false;
2138 else
2139 matches[lane] = false;
2140 goto out;
2142 if (dump_enabled_p ())
2144 dump_printf_loc (MSG_NOTE, vect_location,
2145 "matched up op %d to\n", n);
2146 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2148 children.safe_push (child);
2151 /* 3. build SLP nodes to combine the chain. */
2152 for (unsigned lane = 0; lane < group_size; ++lane)
2153 if (chains[lane][0].code != code)
2155 /* See if there's any alternate all-PLUS entry. */
2156 unsigned n;
2157 for (n = 1; n < chain_len; ++n)
2159 for (lane = 0; lane < group_size; ++lane)
2160 if (chains[lane][n].code != code)
2161 break;
2162 if (lane == group_size)
2163 break;
2165 if (n != chain_len)
2167 /* Swap that in at first position. */
2168 std::swap (children[0], children[n]);
2169 for (lane = 0; lane < group_size; ++lane)
2170 std::swap (chains[lane][0], chains[lane][n]);
2172 else
2174 /* ??? When this triggers and we end up with two
2175 vect_constant/external_def operands up-front, things break (ICE)
2176 spectacularly when finding an insertion place for the
2177 all-constant op. We should have a fully
2178 vect_internal_def operand though(?) so we can swap
2179 that into first place and then prepend the all-zero
2180 constant. */
2181 if (dump_enabled_p ())
2182 dump_printf_loc (MSG_NOTE, vect_location,
2183 "inserting constant zero to compensate "
2184 "for (partially) negated first "
2185 "operand\n");
2186 chain_len++;
2187 for (lane = 0; lane < group_size; ++lane)
2188 chains[lane].safe_insert
2189 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2190 vec<tree> zero_ops;
2191 zero_ops.create (group_size);
2192 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2193 for (lane = 1; lane < group_size; ++lane)
2194 zero_ops.quick_push (zero_ops[0]);
2195 slp_tree zero = vect_create_new_slp_node (zero_ops);
2196 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2197 children.safe_insert (0, zero);
2199 break;
2201 for (unsigned i = 1; i < children.length (); ++i)
2203 slp_tree op0 = children[i - 1];
2204 slp_tree op1 = children[i];
2205 bool this_two_op = false;
2206 for (unsigned lane = 0; lane < group_size; ++lane)
2207 if (chains[lane][i].code != chains[0][i].code)
2209 this_two_op = true;
2210 break;
2212 slp_tree child;
2213 if (i == children.length () - 1)
2214 child = vect_create_new_slp_node (node, stmts, 2);
2215 else
2216 child = vect_create_new_slp_node (2, ERROR_MARK);
2217 if (this_two_op)
2219 vec<std::pair<unsigned, unsigned> > lperm;
2220 lperm.create (group_size);
2221 for (unsigned lane = 0; lane < group_size; ++lane)
2222 lperm.quick_push (std::make_pair
2223 (chains[lane][i].code != chains[0][i].code, lane));
2224 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2225 (chains[0][i].code == code
2226 ? op_stmt_info
2227 : other_op_stmt_info),
2228 (chains[0][i].code == code
2229 ? other_op_stmt_info
2230 : op_stmt_info),
2231 lperm);
2233 else
2235 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2236 SLP_TREE_VECTYPE (child) = vectype;
2237 SLP_TREE_LANES (child) = group_size;
2238 SLP_TREE_CHILDREN (child).quick_push (op0);
2239 SLP_TREE_CHILDREN (child).quick_push (op1);
2240 SLP_TREE_REPRESENTATIVE (child)
2241 = (chains[0][i].code == code
2242 ? op_stmt_info : other_op_stmt_info);
2244 children[i] = child;
2246 *tree_size += this_tree_size + 1;
2247 *max_nunits = this_max_nunits;
2248 while (!chains.is_empty ())
2249 chains.pop ().release ();
2250 return node;
2252 out:
2253 while (!children.is_empty ())
2254 vect_free_slp_tree (children.pop ());
2255 while (!chains.is_empty ())
2256 chains.pop ().release ();
2257 /* Hard-fail, otherwise we might run into quadratic processing of the
2258 chains starting one stmt into the chain again. */
2259 if (hard_fail)
2260 return NULL;
2261 /* Fall thru to normal processing. */
2264 /* Get at the operands, verifying they are compatible. */
2265 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2266 slp_oprnd_info oprnd_info;
2267 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2269 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2270 stmts, i, &oprnds_info);
2271 if (res != 0)
2272 matches[(res == -1) ? 0 : i] = false;
2273 if (!matches[0])
2274 break;
2276 for (i = 0; i < group_size; ++i)
2277 if (!matches[i])
2279 vect_free_oprnd_info (oprnds_info);
2280 return NULL;
2282 swap = NULL;
2284 auto_vec<slp_tree, 4> children;
2286 stmt_info = stmts[0];
2288 /* Create SLP_TREE nodes for the definition node/s. */
2289 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2291 slp_tree child;
2292 unsigned int j;
2294 /* We're skipping certain operands from processing, for example
2295 outer loop reduction initial defs. */
2296 if (skip_args[i])
2298 children.safe_push (NULL);
2299 continue;
2302 if (oprnd_info->first_dt == vect_uninitialized_def)
2304 /* COND_EXPRs eventually have one operand too many if the
2305 condition is an SSA name. */
2306 gcc_assert (i == 3 && nops == 4);
2307 continue;
2310 if (is_a <bb_vec_info> (vinfo)
2311 && oprnd_info->first_dt == vect_internal_def
2312 && !oprnd_info->any_pattern)
2314 /* For BB vectorization, if all defs are the same do not
2315 bother to continue the build along the single-lane
2316 graph but use a splat of the scalar value. */
2317 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2318 for (j = 1; j < group_size; ++j)
2319 if (oprnd_info->def_stmts[j] != first_def)
2320 break;
2321 if (j == group_size
2322 /* But avoid doing this for loads where we may be
2323 able to CSE things, unless the stmt is not
2324 vectorizable. */
2325 && (!STMT_VINFO_VECTORIZABLE (first_def)
2326 || !gimple_vuse (first_def->stmt)))
2328 if (dump_enabled_p ())
2329 dump_printf_loc (MSG_NOTE, vect_location,
2330 "Using a splat of the uniform operand %G",
2331 first_def->stmt);
2332 oprnd_info->first_dt = vect_external_def;
2336 if (oprnd_info->first_dt == vect_external_def
2337 || oprnd_info->first_dt == vect_constant_def)
2339 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2340 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2341 oprnd_info->ops = vNULL;
2342 children.safe_push (invnode);
2343 continue;
2346 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2347 group_size, &this_max_nunits,
2348 matches, limit,
2349 &this_tree_size, bst_map)) != NULL)
2351 oprnd_info->def_stmts = vNULL;
2352 children.safe_push (child);
2353 continue;
2356 /* If the SLP build for operand zero failed and operand zero
2357 and one can be commuted, try that for the scalar stmts
2358 that failed the match. */
2359 if (i == 0
2360 /* A first scalar stmt mismatch signals a fatal mismatch. */
2361 && matches[0]
2362 /* ??? For COND_EXPRs we can swap the comparison operands
2363 as well as the arms under some constraints. */
2364 && nops == 2
2365 && oprnds_info[1]->first_dt == vect_internal_def
2366 && is_gimple_assign (stmt_info->stmt)
2367 /* Swapping operands for reductions breaks assumptions later on. */
2368 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2369 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2371 /* See whether we can swap the matching or the non-matching
2372 stmt operands. */
2373 bool swap_not_matching = true;
2376 for (j = 0; j < group_size; ++j)
2378 if (matches[j] != !swap_not_matching)
2379 continue;
2380 stmt_vec_info stmt_info = stmts[j];
2381 /* Verify if we can swap operands of this stmt. */
2382 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2383 if (!stmt
2384 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2386 if (!swap_not_matching)
2387 goto fail;
2388 swap_not_matching = false;
2389 break;
2393 while (j != group_size);
2395 /* Swap mismatched definition stmts. */
2396 if (dump_enabled_p ())
2397 dump_printf_loc (MSG_NOTE, vect_location,
2398 "Re-trying with swapped operands of stmts ");
2399 for (j = 0; j < group_size; ++j)
2400 if (matches[j] == !swap_not_matching)
2402 std::swap (oprnds_info[0]->def_stmts[j],
2403 oprnds_info[1]->def_stmts[j]);
2404 std::swap (oprnds_info[0]->ops[j],
2405 oprnds_info[1]->ops[j]);
2406 if (dump_enabled_p ())
2407 dump_printf (MSG_NOTE, "%d ", j);
2409 if (dump_enabled_p ())
2410 dump_printf (MSG_NOTE, "\n");
2411 /* After swapping some operands we lost track of whether an
2412 operand has any pattern defs, so be conservative here. */
2413 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2414 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2415 /* And try again with scratch 'matches' ... */
2416 bool *tem = XALLOCAVEC (bool, group_size);
2417 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2418 group_size, &this_max_nunits,
2419 tem, limit,
2420 &this_tree_size, bst_map)) != NULL)
2422 oprnd_info->def_stmts = vNULL;
2423 children.safe_push (child);
2424 continue;
2427 fail:
2429 /* If the SLP build failed and we are analyzing a basic block,
2430 simply treat nodes we fail to build as externally defined
2431 (and thus build vectors from the scalar defs).
2432 The cost model will reject outright expensive cases.
2433 ??? This doesn't treat cases where permutation ultimately
2434 fails (or we don't try permutation below). Ideally we'd
2435 even compute a permutation that will end up with the maximum
2436 SLP tree size... */
2437 if (is_a <bb_vec_info> (vinfo)
2438 /* ??? Rejecting patterns this way doesn't work. We'd have to
2439 do extra work to cancel the pattern so the uses see the
2440 scalar version. */
2441 && !is_pattern_stmt_p (stmt_info)
2442 && !oprnd_info->any_pattern)
2444 /* But if there's a leading vector sized set of matching stmts
2445 fail here so we can split the group. This matches the condition
2446 vect_analyze_slp_instance uses. */
2447 /* ??? We might want to split here and combine the results to support
2448 multiple vector sizes better. */
2449 for (j = 0; j < group_size; ++j)
2450 if (!matches[j])
2451 break;
2452 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2454 if (dump_enabled_p ())
2455 dump_printf_loc (MSG_NOTE, vect_location,
2456 "Building vector operands from scalars\n");
2457 this_tree_size++;
2458 child = vect_create_new_slp_node (oprnd_info->ops);
2459 children.safe_push (child);
2460 oprnd_info->ops = vNULL;
2461 continue;
2465 gcc_assert (child == NULL);
2466 FOR_EACH_VEC_ELT (children, j, child)
2467 if (child)
2468 vect_free_slp_tree (child);
2469 vect_free_oprnd_info (oprnds_info);
2470 return NULL;
2473 vect_free_oprnd_info (oprnds_info);
2475 /* If all children of this node are built up from uniform scalars,
2476 or if building it requires more than one possibly expensive vector
2477 construction, just throw the node away so that it is built up from
2478 scalars instead. The exception is the SLP node for the vector store. */
2479 if (is_a <bb_vec_info> (vinfo)
2480 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2481 /* ??? Rejecting patterns this way doesn't work. We'd have to
2482 do extra work to cancel the pattern so the uses see the
2483 scalar version. */
2484 && !is_pattern_stmt_p (stmt_info))
2486 slp_tree child;
2487 unsigned j;
2488 bool all_uniform_p = true;
2489 unsigned n_vector_builds = 0;
2490 FOR_EACH_VEC_ELT (children, j, child)
2492 if (!child)
2494 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2495 all_uniform_p = false;
2496 else if (!vect_slp_tree_uniform_p (child))
2498 all_uniform_p = false;
2499 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2500 n_vector_builds++;
2503 if (all_uniform_p
2504 || n_vector_builds > 1
2505 || (n_vector_builds == children.length ()
2506 && is_a <gphi *> (stmt_info->stmt)))
2508 /* Roll back. */
2509 matches[0] = false;
2510 FOR_EACH_VEC_ELT (children, j, child)
2511 if (child)
2512 vect_free_slp_tree (child);
2514 if (dump_enabled_p ())
2515 dump_printf_loc (MSG_NOTE, vect_location,
2516 "Building parent vector operands from "
2517 "scalars instead\n");
2518 return NULL;
2522 *tree_size += this_tree_size + 1;
2523 *max_nunits = this_max_nunits;
2525 if (two_operators)
2527 /* ??? We'd likely want to either cache in bst_map something like
2528 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2529 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2530 explicit stmts to put in so the keying on 'stmts' doesn't
2531 work (but we have the same issue with nodes that use 'ops'). */
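/* A purely illustrative example of the two-operator case handled here:
   for the group
     a[0] = b[0] + c[0];   a[1] = b[1] - c[1];
     a[2] = b[2] + c[2];   a[3] = b[3] - c[3];
   node ONE computes all lanes as PLUS_EXPR, node TWO as MINUS_EXPR, and
   the lane permutation built below is { (0,0) (1,1) (0,2) (1,3) },
   selecting the even lanes from ONE and the odd lanes from TWO.  */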
2532 slp_tree one = new _slp_tree;
2533 slp_tree two = new _slp_tree;
2534 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2535 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2536 SLP_TREE_VECTYPE (one) = vectype;
2537 SLP_TREE_VECTYPE (two) = vectype;
2538 SLP_TREE_CHILDREN (one).safe_splice (children);
2539 SLP_TREE_CHILDREN (two).safe_splice (children);
2540 slp_tree child;
2541 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2542 SLP_TREE_REF_COUNT (child)++;
2544 /* Here we record the original defs since this
2545 node represents the final lane configuration. */
2546 node = vect_create_new_slp_node (node, stmts, 2);
2547 SLP_TREE_VECTYPE (node) = vectype;
2548 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2549 SLP_TREE_CHILDREN (node).quick_push (one);
2550 SLP_TREE_CHILDREN (node).quick_push (two);
2551 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2552 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2553 enum tree_code ocode = ERROR_MARK;
2554 stmt_vec_info ostmt_info;
2555 unsigned j = 0;
2556 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2558 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2559 if (gimple_assign_rhs_code (ostmt) != code0)
2561 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2562 ocode = gimple_assign_rhs_code (ostmt);
2563 j = i;
2565 else
2566 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2568 SLP_TREE_CODE (one) = code0;
2569 SLP_TREE_CODE (two) = ocode;
2570 SLP_TREE_LANES (one) = stmts.length ();
2571 SLP_TREE_LANES (two) = stmts.length ();
2572 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2573 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2574 return node;
2577 node = vect_create_new_slp_node (node, stmts, nops);
2578 SLP_TREE_VECTYPE (node) = vectype;
2579 SLP_TREE_CHILDREN (node).splice (children);
2580 return node;
2583 /* Dump a single SLP tree NODE. */
2585 static void
2586 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2587 slp_tree node)
2589 unsigned i, j;
2590 slp_tree child;
2591 stmt_vec_info stmt_info;
2592 tree op;
2594 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2595 dump_user_location_t user_loc = loc.get_user_location ();
2596 dump_printf_loc (metadata, user_loc,
2597 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2598 ", refcnt=%u)",
2599 SLP_TREE_DEF_TYPE (node) == vect_external_def
2600 ? " (external)"
2601 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2602 ? " (constant)"
2603 : ""), (void *) node,
2604 estimated_poly_value (node->max_nunits),
2605 SLP_TREE_REF_COUNT (node));
2606 if (SLP_TREE_VECTYPE (node))
2607 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2608 dump_printf (metadata, "\n");
2609 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2611 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2612 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2613 else
2614 dump_printf_loc (metadata, user_loc, "op template: %G",
2615 SLP_TREE_REPRESENTATIVE (node)->stmt);
2617 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2618 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2619 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2620 else
2622 dump_printf_loc (metadata, user_loc, "\t{ ");
2623 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2624 dump_printf (metadata, "%T%s ", op,
2625 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2626 dump_printf (metadata, "}\n");
2628 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2630 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2631 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2632 dump_printf (dump_kind, " %u", j);
2633 dump_printf (dump_kind, " }\n");
2635 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2637 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2638 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2639 dump_printf (dump_kind, " %u[%u]",
2640 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2641 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2642 dump_printf (dump_kind, " }\n");
2644 if (SLP_TREE_CHILDREN (node).is_empty ())
2645 return;
2646 dump_printf_loc (metadata, user_loc, "\tchildren");
2647 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2648 dump_printf (dump_kind, " %p", (void *)child);
2649 dump_printf (dump_kind, "\n");
2652 DEBUG_FUNCTION void
2653 debug (slp_tree node)
2655 debug_dump_context ctx;
2656 vect_print_slp_tree (MSG_NOTE,
2657 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2658 node);
2661 /* Recursive helper for the dot producer below. */
2663 static void
2664 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2666 if (visited.add (node))
2667 return;
2669 fprintf (f, "\"%p\" [label=\"", (void *)node);
2670 vect_print_slp_tree (MSG_NOTE,
2671 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2672 node);
2673 fprintf (f, "\"];\n");
2676 for (slp_tree child : SLP_TREE_CHILDREN (node))
2677 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2679 for (slp_tree child : SLP_TREE_CHILDREN (node))
2680 if (child)
2681 dot_slp_tree (f, child, visited);
2684 DEBUG_FUNCTION void
2685 dot_slp_tree (const char *fname, slp_tree node)
2687 FILE *f = fopen (fname, "w");
2688 fprintf (f, "digraph {\n");
2689 fflush (f);
2691 debug_dump_context ctx (f);
2692 hash_set<slp_tree> visited;
2693 dot_slp_tree (f, node, visited);
2695 fflush (f);
2696 fprintf (f, "}\n");
2697 fclose (f);
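/* Both debug () and dot_slp_tree () above are DEBUG_FUNCTIONs meant to be
   invoked from a debugger session rather than from the compiler itself;
   for example (illustrative):
     (gdb) call debug (node)
     (gdb) call dot_slp_tree ("/tmp/slp.dot", node)
   The resulting dot file can then be rendered with graphviz,
   e.g. "dot -Tpdf".  */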
2700 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2702 static void
2703 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2704 slp_tree node, hash_set<slp_tree> &visited)
2706 unsigned i;
2707 slp_tree child;
2709 if (visited.add (node))
2710 return;
2712 vect_print_slp_tree (dump_kind, loc, node);
2714 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2715 if (child)
2716 vect_print_slp_graph (dump_kind, loc, child, visited);
2719 static void
2720 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2721 slp_tree entry)
2723 hash_set<slp_tree> visited;
2724 vect_print_slp_graph (dump_kind, loc, entry, visited);
2727 /* Mark the tree rooted at NODE with PURE_SLP. */
2729 static void
2730 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2732 int i;
2733 stmt_vec_info stmt_info;
2734 slp_tree child;
2736 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2737 return;
2739 if (visited.add (node))
2740 return;
2742 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2743 STMT_SLP_TYPE (stmt_info) = pure_slp;
2745 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2746 if (child)
2747 vect_mark_slp_stmts (child, visited);
2750 static void
2751 vect_mark_slp_stmts (slp_tree node)
2753 hash_set<slp_tree> visited;
2754 vect_mark_slp_stmts (node, visited);
2757 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2759 static void
2760 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2762 int i;
2763 stmt_vec_info stmt_info;
2764 slp_tree child;
2766 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2767 return;
2769 if (visited.add (node))
2770 return;
2772 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2774 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2775 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2776 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2779 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2780 if (child)
2781 vect_mark_slp_stmts_relevant (child, visited);
2784 static void
2785 vect_mark_slp_stmts_relevant (slp_tree node)
2787 hash_set<slp_tree> visited;
2788 vect_mark_slp_stmts_relevant (node, visited);
2792 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2794 static void
2795 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2796 hash_set<slp_tree> &visited)
2798 if (!node || visited.add (node))
2799 return;
2801 if (SLP_TREE_CHILDREN (node).length () == 0)
2803 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2804 return;
2805 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2806 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2807 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2808 loads.safe_push (node);
2810 else
2812 unsigned i;
2813 slp_tree child;
2814 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2815 vect_gather_slp_loads (loads, child, visited);
2822 /* Find the last scalar stmt in NODE. */
2822 stmt_vec_info
2823 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2825 stmt_vec_info last = NULL;
2826 stmt_vec_info stmt_vinfo;
2828 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2830 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2831 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2834 return last;
2837 /* Find the first stmt in NODE. */
2839 stmt_vec_info
2840 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2842 stmt_vec_info first = NULL;
2843 stmt_vec_info stmt_vinfo;
2845 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2847 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2848 if (!first
2849 || get_later_stmt (stmt_vinfo, first) == first)
2850 first = stmt_vinfo;
2853 return first;
2856 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2857 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2858 (also containing the first GROUP1_SIZE stmts, since stores are
2859 consecutive), the second containing the remainder.
2860 Return the first stmt in the second group. */
2862 static stmt_vec_info
2863 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2865 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2866 gcc_assert (group1_size > 0);
2867 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2868 gcc_assert (group2_size > 0);
2869 DR_GROUP_SIZE (first_vinfo) = group1_size;
2871 stmt_vec_info stmt_info = first_vinfo;
2872 for (unsigned i = group1_size; i > 1; i--)
2874 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2875 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2877 /* STMT is now the last element of the first group. */
2878 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2879 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2881 DR_GROUP_SIZE (group2) = group2_size;
2882 for (stmt_info = group2; stmt_info;
2883 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2885 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2886 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2889 /* For the second group, the DR_GROUP_GAP is that before the original group,
2890 plus skipping over the first vector. */
2891 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2893 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2894 DR_GROUP_GAP (first_vinfo) += group2_size;
2896 if (dump_enabled_p ())
2897 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2898 group1_size, group2_size);
2900 return group2;
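/* A purely illustrative example of the splitting above: a store group of
   size 6 with DR_GROUP_GAP 0, split at GROUP1_SIZE == 4, becomes a first
   group of size 4 with gap 0 + 2 == 2 (skipping the second group) and a
   second group of size 2 with gap 0 + 4 == 4 (skipping the first group),
   so both groups still cover a stride of 6 scalar elements.  */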
2903 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2904 statements and a vector of NUNITS elements. */
2906 static poly_uint64
2907 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2909 return exact_div (common_multiple (nunits, group_size), group_size);
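/* For instance (purely illustrative), with nunits == 4 and a group of
   6 stmts, common_multiple (4, 6) == 12 and the unrolling factor is
   12 / 6 == 2, i.e. two copies of the group are needed to fill whole
   vectors.  */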
2912 /* Helper that checks to see if a node is a load node. */
2914 static inline bool
2915 vect_is_slp_load_node (slp_tree root)
2917 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2918 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2919 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2923 /* Helper function of optimize_load_redistribution that performs the operation
2924 recursively. */
2926 static slp_tree
2927 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2928 vec_info *vinfo, unsigned int group_size,
2929 hash_map<slp_tree, slp_tree> *load_map,
2930 slp_tree root)
2932 if (slp_tree *leader = load_map->get (root))
2933 return *leader;
2935 slp_tree node;
2936 unsigned i;
2938 /* For now, we don't know anything about externals so do not do anything. */
2939 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2940 return NULL;
2941 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2943 /* First convert this node into a load node and add it to the leaves
2944 list and flatten the permute from a lane to a load one. If it's
2945 unneeded it will be elided later. */
2946 vec<stmt_vec_info> stmts;
2947 stmts.create (SLP_TREE_LANES (root));
2948 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2949 for (unsigned j = 0; j < lane_perm.length (); j++)
2951 std::pair<unsigned, unsigned> perm = lane_perm[j];
2952 node = SLP_TREE_CHILDREN (root)[perm.first];
2954 if (!vect_is_slp_load_node (node)
2955 || SLP_TREE_CHILDREN (node).exists ())
2957 stmts.release ();
2958 goto next;
2961 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2964 if (dump_enabled_p ())
2965 dump_printf_loc (MSG_NOTE, vect_location,
2966 "converting stmts on permute node %p\n",
2967 (void *) root);
2969 bool *matches = XALLOCAVEC (bool, group_size);
2970 poly_uint64 max_nunits = 1;
2971 unsigned tree_size = 0, limit = 1;
2972 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2973 matches, &limit, &tree_size, bst_map);
2974 if (!node)
2975 stmts.release ();
2977 load_map->put (root, node);
2978 return node;
2981 next:
2982 load_map->put (root, NULL);
2984 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2986 slp_tree value
2987 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2988 node);
2989 if (value)
2991 SLP_TREE_REF_COUNT (value)++;
2992 SLP_TREE_CHILDREN (root)[i] = value;
2993 /* ??? We know the original leaves of the replaced nodes will
2994 be referenced by bst_map, only the permutes created by
2995 pattern matching are not. */
2996 if (SLP_TREE_REF_COUNT (node) == 1)
2997 load_map->remove (node);
2998 vect_free_slp_tree (node);
3002 return NULL;
3005 /* Temporary workaround for loads not being CSEd during SLP build. This
3006 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
3007 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3008 same DR such that the final operation is equal to a permuted load. Such
3009 NODES are then directly converted into LOADS themselves. The nodes are
3010 CSEd using BST_MAP. */
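/* A purely illustrative example: a VEC_PERM node blending the two-lane
   loads { a[0], a[2] } and { a[1], a[3] } with lane permutation
   { (0,0) (1,0) (0,1) (1,1) } reads the four consecutive elements
   a[0], a[1], a[2], a[3] from the same DR and is therefore rebuilt as a
   single four-lane load node.  */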
3012 static void
3013 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3014 vec_info *vinfo, unsigned int group_size,
3015 hash_map<slp_tree, slp_tree> *load_map,
3016 slp_tree root)
3018 slp_tree node;
3019 unsigned i;
3021 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3023 slp_tree value
3024 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3025 node);
3026 if (value)
3028 SLP_TREE_REF_COUNT (value)++;
3029 SLP_TREE_CHILDREN (root)[i] = value;
3030 /* ??? We know the original leaves of the replaced nodes will
3031 be referenced by bst_map, only the permutes created by
3032 pattern matching are not. */
3033 if (SLP_TREE_REF_COUNT (node) == 1)
3034 load_map->remove (node);
3035 vect_free_slp_tree (node);
3040 /* Helper function of vect_match_slp_patterns.
3042 Attempts to match patterns against the slp tree rooted in REF_NODE using
3043 VINFO. Patterns are matched in post-order traversal.
3045 If matching is successful the value in REF_NODE is updated in place.
3046 Returns true if any pattern was matched in the tree. */
3048 static bool
3049 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3050 slp_tree_to_load_perm_map_t *perm_cache,
3051 slp_compat_nodes_map_t *compat_cache,
3052 hash_set<slp_tree> *visited)
3054 unsigned i;
3055 slp_tree node = *ref_node;
3056 bool found_p = false;
3057 if (!node || visited->add (node))
3058 return false;
3060 slp_tree child;
3061 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3062 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3063 vinfo, perm_cache, compat_cache,
3064 visited);
3066 for (unsigned x = 0; x < num__slp_patterns; x++)
3068 vect_pattern *pattern
3069 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3070 if (pattern)
3072 pattern->build (vinfo);
3073 delete pattern;
3074 found_p = true;
3078 return found_p;
3081 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3082 vec_info VINFO.
3084 The tree is modified in place; returns true if any pattern matched.
3085 Patterns are tried in order and multiple patterns may match. */
3087 static bool
3088 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3089 hash_set<slp_tree> *visited,
3090 slp_tree_to_load_perm_map_t *perm_cache,
3091 slp_compat_nodes_map_t *compat_cache)
3093 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3094 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3096 if (dump_enabled_p ())
3097 dump_printf_loc (MSG_NOTE, vect_location,
3098 "Analyzing SLP tree %p for patterns\n",
3099 (void *) SLP_INSTANCE_TREE (instance));
3101 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3102 visited);
3105 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3106 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3107 Return true if we could use IFN_STORE_LANES instead and if that appears
3108 to be the better approach. */
3110 static bool
3111 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3112 unsigned int group_size,
3113 unsigned int new_group_size)
3115 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3116 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3117 if (!vectype)
3118 return false;
3119 /* Allow the split if one of the two new groups would operate on full
3120 vectors *within* rather than across one scalar loop iteration.
3121 This is purely a heuristic, but it should work well for group
3122 sizes of 3 and 4, where the possible splits are:
3124 3->2+1: OK if the vector has exactly two elements
3125 4->2+2: Likewise
3126 4->3+1: Less clear-cut. */
3127 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3128 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3129 return false;
3130 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
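/* Purely illustrative examples of the heuristic above, assuming a
   two-element vector type: splitting a group of 3 at NEW_GROUP_SIZE == 2
   leaves sub-groups of 2 and 1; 2 is a multiple of the vector length so
   the split is allowed (return false).  Splitting a group of 4 at 3
   leaves 3 and 1, neither a multiple of 2, so IFN_STORE_LANES is
   preferred whenever the target supports it for this group size.  */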
3133 /* Analyze an SLP instance starting from a group of grouped stores. Call
3134 vect_build_slp_tree to build a tree of packed stmts if possible.
3135 Return FALSE if it's impossible to SLP any stmt in the loop. */
3137 static bool
3138 vect_analyze_slp_instance (vec_info *vinfo,
3139 scalar_stmts_to_slp_tree_map_t *bst_map,
3140 stmt_vec_info stmt_info, slp_instance_kind kind,
3141 unsigned max_tree_size, unsigned *limit);
3143 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3144 of KIND. Return true if successful. */
3146 static bool
3147 vect_build_slp_instance (vec_info *vinfo,
3148 slp_instance_kind kind,
3149 vec<stmt_vec_info> &scalar_stmts,
3150 vec<stmt_vec_info> &root_stmt_infos,
3151 vec<tree> &remain,
3152 unsigned max_tree_size, unsigned *limit,
3153 scalar_stmts_to_slp_tree_map_t *bst_map,
3154 /* ??? We need stmt_info for group splitting. */
3155 stmt_vec_info stmt_info_)
3157 if (kind == slp_inst_kind_ctor)
3159 if (dump_enabled_p ())
3160 dump_printf_loc (MSG_NOTE, vect_location,
3161 "Analyzing vectorizable constructor: %G\n",
3162 root_stmt_infos[0]->stmt);
3165 if (dump_enabled_p ())
3167 dump_printf_loc (MSG_NOTE, vect_location,
3168 "Starting SLP discovery for\n");
3169 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3170 dump_printf_loc (MSG_NOTE, vect_location,
3171 " %G", scalar_stmts[i]->stmt);
3174 /* When a BB reduction doesn't have an even number of lanes
3175 strip it down, treating the remaining lane as scalar.
3176 ??? Selecting the optimal set of lanes to vectorize would be nice
3177 but SLP build for all lanes will fail quickly because we think
3178 we're going to need unrolling. */
3179 if (kind == slp_inst_kind_bb_reduc
3180 && (scalar_stmts.length () & 1))
3181 remain.safe_insert (0, gimple_get_lhs (scalar_stmts.pop ()->stmt));
3183 /* Build the tree for the SLP instance. */
3184 unsigned int group_size = scalar_stmts.length ();
3185 bool *matches = XALLOCAVEC (bool, group_size);
3186 poly_uint64 max_nunits = 1;
3187 unsigned tree_size = 0;
3188 unsigned i;
3189 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3190 &max_nunits, matches, limit,
3191 &tree_size, bst_map);
3192 if (node != NULL)
3194 /* Calculate the unrolling factor based on the smallest type. */
3195 poly_uint64 unrolling_factor
3196 = calculate_unrolling_factor (max_nunits, group_size);
3198 if (maybe_ne (unrolling_factor, 1U)
3199 && is_a <bb_vec_info> (vinfo))
3201 unsigned HOST_WIDE_INT const_max_nunits;
3202 if (!max_nunits.is_constant (&const_max_nunits)
3203 || const_max_nunits > group_size)
3205 if (dump_enabled_p ())
3206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3207 "Build SLP failed: store group "
3208 "size not a multiple of the vector size "
3209 "in basic block SLP\n");
3210 vect_free_slp_tree (node);
3211 return false;
3213 /* Fatal mismatch. */
3214 if (dump_enabled_p ())
3215 dump_printf_loc (MSG_NOTE, vect_location,
3216 "SLP discovery succeeded but node needs "
3217 "splitting\n");
3218 memset (matches, true, group_size);
3219 matches[group_size / const_max_nunits * const_max_nunits] = false;
3220 vect_free_slp_tree (node);
3222 else
3224 /* Create a new SLP instance. */
3225 slp_instance new_instance = XNEW (class _slp_instance);
3226 SLP_INSTANCE_TREE (new_instance) = node;
3227 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3228 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3229 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3230 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3231 SLP_INSTANCE_KIND (new_instance) = kind;
3232 new_instance->reduc_phis = NULL;
3233 new_instance->cost_vec = vNULL;
3234 new_instance->subgraph_entries = vNULL;
3236 if (dump_enabled_p ())
3237 dump_printf_loc (MSG_NOTE, vect_location,
3238 "SLP size %u vs. limit %u.\n",
3239 tree_size, max_tree_size);
3241 /* Fixup SLP reduction chains. */
3242 if (kind == slp_inst_kind_reduc_chain)
3244 /* If this is a reduction chain with a conversion in front
3245 amend the SLP tree with a node for that. */
3246 gimple *scalar_def
3247 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3248 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3250 /* Get at the conversion stmt - we know it's the single use
3251 of the last stmt of the reduction chain. */
3252 use_operand_p use_p;
3253 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3254 &use_p, &scalar_def);
3255 gcc_assert (r);
3256 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3257 next_info = vect_stmt_to_vectorize (next_info);
3258 scalar_stmts = vNULL;
3259 scalar_stmts.create (group_size);
3260 for (unsigned i = 0; i < group_size; ++i)
3261 scalar_stmts.quick_push (next_info);
3262 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3263 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3264 SLP_TREE_CHILDREN (conv).quick_push (node);
3265 SLP_INSTANCE_TREE (new_instance) = conv;
3266 /* We also have to fake this conversion stmt as SLP reduction
3267 group so we don't have to mess with too much code
3268 elsewhere. */
3269 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3270 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3272 /* Fill the backedge child of the PHI SLP node. The
3273 general matching code cannot find it because the
3274 scalar code does not reflect how we vectorize the
3275 reduction. */
3276 use_operand_p use_p;
3277 imm_use_iterator imm_iter;
3278 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3279 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3280 gimple_get_lhs (scalar_def))
3281 /* There are exactly two non-debug uses, the reduction
3282 PHI and the loop-closed PHI node. */
3283 if (!is_gimple_debug (USE_STMT (use_p))
3284 && gimple_bb (USE_STMT (use_p)) == loop->header)
3286 auto_vec<stmt_vec_info, 64> phis (group_size);
3287 stmt_vec_info phi_info
3288 = vinfo->lookup_stmt (USE_STMT (use_p));
3289 for (unsigned i = 0; i < group_size; ++i)
3290 phis.quick_push (phi_info);
3291 slp_tree *phi_node = bst_map->get (phis);
3292 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3293 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3294 = SLP_INSTANCE_TREE (new_instance);
3295 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3299 vinfo->slp_instances.safe_push (new_instance);
3301 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3302 the number of scalar stmts in the root in a few places.
3303 Verify that assumption holds. */
3304 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3305 .length () == group_size);
3307 if (dump_enabled_p ())
3309 dump_printf_loc (MSG_NOTE, vect_location,
3310 "Final SLP tree for instance %p:\n",
3311 (void *) new_instance);
3312 vect_print_slp_graph (MSG_NOTE, vect_location,
3313 SLP_INSTANCE_TREE (new_instance));
3316 return true;
3319 else
3321 /* Failed to SLP. */
3322 /* Free the allocated memory. */
3323 scalar_stmts.release ();
3326 stmt_vec_info stmt_info = stmt_info_;
3327 /* Try to break the group up into pieces. */
3328 if (kind == slp_inst_kind_store)
3330 /* ??? We could delay all the actual splitting of store-groups
3331 until after SLP discovery of the original group completed.
3332 Then we can recurse to vect_build_slp_instance directly. */
3333 for (i = 0; i < group_size; i++)
3334 if (!matches[i])
3335 break;
3337 /* For basic block SLP, try to break the group up into multiples of
3338 a vector size. */
3339 if (is_a <bb_vec_info> (vinfo)
3340 && (i > 1 && i < group_size))
3342 tree scalar_type
3343 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3344 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3345 1 << floor_log2 (i));
3346 unsigned HOST_WIDE_INT const_nunits;
3347 if (vectype
3348 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3350 /* Split into two groups at the first vector boundary. */
3351 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3352 unsigned group1_size = i & ~(const_nunits - 1);
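/* E.g. (illustrative) with const_nunits == 4 and i == 6 this yields
   group1_size == 4, i.e. I rounded down to a multiple of the vector
   length.  */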
3354 if (dump_enabled_p ())
3355 dump_printf_loc (MSG_NOTE, vect_location,
3356 "Splitting SLP group at stmt %u\n", i);
3357 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3358 group1_size);
3359 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3360 kind, max_tree_size,
3361 limit);
3362 /* Split the rest at the failure point and possibly
3363 re-analyze the remaining matching part if it has
3364 at least two lanes. */
3365 if (group1_size < i
3366 && (i + 1 < group_size
3367 || i - group1_size > 1))
3369 stmt_vec_info rest2 = rest;
3370 rest = vect_split_slp_store_group (rest, i - group1_size);
3371 if (i - group1_size > 1)
3372 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3373 kind, max_tree_size,
3374 limit);
3376 /* Re-analyze the non-matching tail if it has at least
3377 two lanes. */
3378 if (i + 1 < group_size)
3379 res |= vect_analyze_slp_instance (vinfo, bst_map,
3380 rest, kind, max_tree_size,
3381 limit);
3382 return res;
3386 /* For loop vectorization split into arbitrary pieces of size > 1. */
3387 if (is_a <loop_vec_info> (vinfo)
3388 && (i > 1 && i < group_size)
3389 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3391 unsigned group1_size = i;
3393 if (dump_enabled_p ())
3394 dump_printf_loc (MSG_NOTE, vect_location,
3395 "Splitting SLP group at stmt %u\n", i);
3397 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3398 group1_size);
3399 /* Loop vectorization cannot handle gaps in stores; make sure
3400 the split group appears as strided. */
3401 STMT_VINFO_STRIDED_P (rest) = 1;
3402 DR_GROUP_GAP (rest) = 0;
3403 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3404 DR_GROUP_GAP (stmt_info) = 0;
3406 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3407 kind, max_tree_size, limit);
3408 if (i + 1 < group_size)
3409 res |= vect_analyze_slp_instance (vinfo, bst_map,
3410 rest, kind, max_tree_size, limit);
3412 return res;
3415 /* Even though the first vector did not all match, we might be able to SLP
3416 (some) of the remainder. FORNOW ignore this possibility. */
3419 /* Failed to SLP. */
3420 if (dump_enabled_p ())
3421 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3422 return false;
3426 /* Analyze an SLP instance starting from a group of grouped stores. Call
3427 vect_build_slp_tree to build a tree of packed stmts if possible.
3428 Return FALSE if it's impossible to SLP any stmt in the loop. */
3430 static bool
3431 vect_analyze_slp_instance (vec_info *vinfo,
3432 scalar_stmts_to_slp_tree_map_t *bst_map,
3433 stmt_vec_info stmt_info,
3434 slp_instance_kind kind,
3435 unsigned max_tree_size, unsigned *limit)
3437 unsigned int i;
3438 vec<stmt_vec_info> scalar_stmts;
3440 if (is_a <bb_vec_info> (vinfo))
3441 vect_location = stmt_info->stmt;
3443 stmt_vec_info next_info = stmt_info;
3444 if (kind == slp_inst_kind_store)
3446 /* Collect the stores and store them in scalar_stmts. */
3447 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3448 while (next_info)
3450 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3451 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3454 else if (kind == slp_inst_kind_reduc_chain)
3456 /* Collect the reduction stmts and store them in scalar_stmts. */
3457 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3458 while (next_info)
3460 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3461 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3463 /* Mark the first element of the reduction chain as reduction to properly
3464 transform the node. In the reduction analysis phase only the last
3465 element of the chain is marked as reduction. */
3466 STMT_VINFO_DEF_TYPE (stmt_info)
3467 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3468 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3469 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3471 else if (kind == slp_inst_kind_reduc_group)
3473 /* Collect reduction statements. */
3474 const vec<stmt_vec_info> &reductions
3475 = as_a <loop_vec_info> (vinfo)->reductions;
3476 scalar_stmts.create (reductions.length ());
3477 for (i = 0; reductions.iterate (i, &next_info); i++)
3478 if ((STMT_VINFO_RELEVANT_P (next_info)
3479 || STMT_VINFO_LIVE_P (next_info))
3480 /* ??? Make sure we didn't skip a conversion around a reduction
3481 path. In that case we'd have to reverse engineer that conversion
3482 stmt following the chain using reduc_idx and from the PHI
3483 using reduc_def. */
3484 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3485 scalar_stmts.quick_push (next_info);
3486 /* If fewer than two were relevant/live there's nothing to SLP. */
3487 if (scalar_stmts.length () < 2)
3488 return false;
3490 else
3491 gcc_unreachable ();
3493 vec<stmt_vec_info> roots = vNULL;
3494 vec<tree> remain = vNULL;
3495 /* Build the tree for the SLP instance. */
3496 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3497 roots, remain,
3498 max_tree_size, limit, bst_map,
3499 kind == slp_inst_kind_store
3500 ? stmt_info : NULL);
3502 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3503 where we should do store group splitting. */
3505 return res;
3508 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
3509 trees of packed scalar stmts if SLP is possible. */
3511 opt_result
3512 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3514 unsigned int i;
3515 stmt_vec_info first_element;
3516 slp_instance instance;
3518 DUMP_VECT_SCOPE ("vect_analyze_slp");
3520 unsigned limit = max_tree_size;
3522 scalar_stmts_to_slp_tree_map_t *bst_map
3523 = new scalar_stmts_to_slp_tree_map_t ();
3525 /* Find SLP sequences starting from groups of grouped stores. */
3526 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3527 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3528 slp_inst_kind_store, max_tree_size, &limit);
3530 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3532 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3534 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3535 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3536 bb_vinfo->roots[i].stmts,
3537 bb_vinfo->roots[i].roots,
3538 bb_vinfo->roots[i].remain,
3539 max_tree_size, &limit, bst_map, NULL))
3541 bb_vinfo->roots[i].stmts = vNULL;
3542 bb_vinfo->roots[i].roots = vNULL;
3543 bb_vinfo->roots[i].remain = vNULL;
3548 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3550 /* Find SLP sequences starting from reduction chains. */
3551 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3552 if (! STMT_VINFO_RELEVANT_P (first_element)
3553 && ! STMT_VINFO_LIVE_P (first_element))
3555 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3556 slp_inst_kind_reduc_chain,
3557 max_tree_size, &limit))
3559 /* Dissolve reduction chain group. */
3560 stmt_vec_info vinfo = first_element;
3561 stmt_vec_info last = NULL;
3562 while (vinfo)
3564 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3565 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3566 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3567 last = vinfo;
3568 vinfo = next;
3570 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3571 /* It can still be vectorized as part of an SLP reduction. */
3572 loop_vinfo->reductions.safe_push (last);
3575 /* Find SLP sequences starting from groups of reductions. */
3576 if (loop_vinfo->reductions.length () > 1)
3577 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3578 slp_inst_kind_reduc_group, max_tree_size,
3579 &limit);
3582 hash_set<slp_tree> visited_patterns;
3583 slp_tree_to_load_perm_map_t perm_cache;
3584 slp_compat_nodes_map_t compat_cache;
3586 /* See if any patterns can be found in the SLP tree. */
3587 bool pattern_found = false;
3588 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3589 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3590 &visited_patterns, &perm_cache,
3591 &compat_cache);
3593 /* If any were found optimize permutations of loads. */
3594 if (pattern_found)
3596 hash_map<slp_tree, slp_tree> load_map;
3597 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3599 slp_tree root = SLP_INSTANCE_TREE (instance);
3600 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3601 &load_map, root);
3607 /* The map keeps a reference on SLP nodes built; release that. */
3608 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3609 it != bst_map->end (); ++it)
3610 if ((*it).second)
3611 vect_free_slp_tree ((*it).second);
3612 delete bst_map;
3614 if (pattern_found && dump_enabled_p ())
3616 dump_printf_loc (MSG_NOTE, vect_location,
3617 "Pattern matched SLP tree\n");
3618 hash_set<slp_tree> visited;
3619 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3620 vect_print_slp_graph (MSG_NOTE, vect_location,
3621 SLP_INSTANCE_TREE (instance), visited);
3624 return opt_result::success ();
3627 /* Estimates the cost of inserting layout changes into the SLP graph.
3628 It can also say that the insertion is impossible. */
3630 struct slpg_layout_cost
3632 slpg_layout_cost () = default;
3633 slpg_layout_cost (sreal, bool);
3635 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3636 bool is_possible () const { return depth != sreal::max (); }
3638 bool operator== (const slpg_layout_cost &) const;
3639 bool operator!= (const slpg_layout_cost &) const;
3641 bool is_better_than (const slpg_layout_cost &, bool) const;
3643 void add_parallel_cost (const slpg_layout_cost &);
3644 void add_serial_cost (const slpg_layout_cost &);
3645 void split (unsigned int);
3647 /* The longest sequence of layout changes needed during any traversal
3648 of the partition dag, weighted by execution frequency.
3650 This is the most important metric when optimizing for speed, since
3651 it helps to ensure that we keep the number of operations on
3652 critical paths to a minimum. */
3653 sreal depth = 0;
3655 /* An estimate of the total number of operations needed. It is weighted by
3656 execution frequency when optimizing for speed but not when optimizing for
3657 size. In order to avoid double-counting, a node with a fanout of N will
3658 distribute 1/N of its total cost to each successor.
3660 This is the most important metric when optimizing for size, since
3661 it helps to keep the total number of operations to a minimum. */
3662 sreal total = 0;
3665 /* Construct costs for a node with weight WEIGHT. A higher weight
3666 indicates more frequent execution. IS_FOR_SIZE is true if we are
3667 optimizing for size rather than speed. */
3669 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3670 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3674 bool
3675 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3677 return depth == other.depth && total == other.total;
3680 bool
3681 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3683 return !operator== (other);
3686 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3687 true if we are optimizing for size rather than speed. */
3689 bool
3690 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3691 bool is_for_size) const
3693 if (is_for_size)
3695 if (total != other.total)
3696 return total < other.total;
3697 return depth < other.depth;
3699 else
3701 if (depth != other.depth)
3702 return depth < other.depth;
3703 return total < other.total;
3707 /* Increase the costs to account for something with cost INPUT_COST
3708 happening in parallel with the current costs. */
3710 void
3711 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3713 depth = std::max (depth, input_cost.depth);
3714 total += input_cost.total;
3717 /* Increase the costs to account for something with cost INPUT_COST
3718 happening in series with the current costs. */
3720 void
3721 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3723 depth += other.depth;
3724 total += other.total;
3727 /* Split the total cost among TIMES successors or predecessors. */
3729 void
3730 slpg_layout_cost::split (unsigned int times)
3732 if (times > 1)
3733 total /= times;
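
/* As a hypothetical worked example of how these operations compose (not
   taken from any particular graph): combining { depth 2, total 3 } with a
   parallel input of { depth 1, total 1 } gives { depth 2, total 4 }, since
   overlapping depths do not add but operation counts do.  Feeding that
   through a serial step of { depth 1, total 1 } gives { depth 3, total 5 },
   and split (2) then charges { depth 3, total 2.5 } to each of two
   consumers, so that the total is not double-counted.  */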
3736 /* Information about one node in the SLP graph, for use during
3737 vect_optimize_slp_pass. */
3739 struct slpg_vertex
3741 slpg_vertex (slp_tree node_) : node (node_) {}
3743 /* The node itself. */
3744 slp_tree node;
3746 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3747 partitions are flexible; they can have whichever layout consumers
3748 want them to have. */
3749 int partition = -1;
3751 /* The number of nodes that directly use the result of this one
3752 (i.e. the number of nodes that count this one as a child). */
3753 unsigned int out_degree = 0;
3755 /* The execution frequency of the node. */
3756 sreal weight = 0;
3758 /* The total execution frequency of all nodes that directly use the
3759 result of this one. */
3760 sreal out_weight = 0;
3763 /* Information about one partition of the SLP graph, for use during
3764 vect_optimize_slp_pass. */
3766 struct slpg_partition_info
3768 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3769 of m_partitioned_nodes. */
3770 unsigned int node_begin = 0;
3771 unsigned int node_end = 0;
3773 /* Which layout we've chosen to use for this partition, or -1 if
3774 we haven't picked one yet. */
3775 int layout = -1;
3777 /* The number of predecessors and successors in the partition dag.
3778 The predecessors always have lower partition numbers and the
3779 successors always have higher partition numbers.
3781 Note that the directions of these edges are not necessarily the
3782 same as in the data flow graph. For example, if an SCC has separate
3783 partitions for an inner loop and an outer loop, the inner loop's
3784 partition will have at least two incoming edges from the outer loop's
3785 partition: one for a live-in value and one for a live-out value.
3786 In data flow terms, one of these edges would also be from the outer loop
3787 to the inner loop, but the other would be in the opposite direction. */
3788 unsigned int in_degree = 0;
3789 unsigned int out_degree = 0;
3792 /* Information about the costs of using a particular layout for a
3793 particular partition. It can also say that the combination is
3794 impossible. */
3796 struct slpg_partition_layout_costs
3798 bool is_possible () const { return internal_cost.is_possible (); }
3799 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3801 /* The costs inherited from predecessor partitions. */
3802 slpg_layout_cost in_cost;
3804 /* The inherent cost of the layout within the node itself. For example,
3805 this is nonzero for a load if choosing a particular layout would require
3806 the load to permute the loaded elements. It is nonzero for a
3807 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3808 to full-vector moves. */
3809 slpg_layout_cost internal_cost;
3811 /* The costs inherited from successor partitions. */
3812 slpg_layout_cost out_cost;
3815 /* This class tries to optimize the layout of vectors in order to avoid
3816 unnecessary shuffling. At the moment, the set of possible layouts is
3817 restricted to bijective permutations.
3819 The goal of the pass depends on whether we're optimizing for size or
3820 for speed. When optimizing for size, the goal is to reduce the overall
3821 number of layout changes (including layout changes implied by things
3822 like load permutations). When optimizing for speed, the goal is to
3823 reduce the maximum latency attributable to layout changes on any
3824 non-cyclical path through the data flow graph.
3826 For example, when optimizing a loop nest for speed, we will prefer
3827 to make layout changes outside of a loop rather than inside of a loop,
3828 and will prefer to make layout changes in parallel rather than serially,
3829 even if that increases the overall number of layout changes.
3831 The high-level procedure is:
3833 (1) Build a graph in which edges go from uses (parents) to definitions
3834 (children).
3836 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3838 (3) When optimizing for speed, partition the nodes in each SCC based
3839 on their containing cfg loop. When optimizing for size, treat
3840 each SCC as a single partition.
3842 This gives us a dag of partitions. The goal is now to assign a
3843 layout to each partition.
3845 (4) Construct a set of vector layouts that are worth considering.
3846 Record which nodes must keep their current layout.
3848 (5) Perform a forward walk over the partition dag (from loads to stores)
3849 accumulating the "forward" cost of using each layout. When visiting
3850 each partition, assign a tentative choice of layout to the partition
3851 and use that choice when calculating the cost of using a different
3852 layout in successor partitions.
3854 (6) Perform a backward walk over the partition dag (from stores to loads),
3855 accumulating the "backward" cost of using each layout. When visiting
3856 each partition, make a final choice of layout for that partition based
3857 on the accumulated forward costs (from (5)) and backward costs
3858 (from (6)).
3860 (7) Apply the chosen layouts to the SLP graph.
3862 For example, consider the SLP statements:
3864 S1: a_1 = load
3865 loop:
3866 S2: a_2 = PHI<a_1, a_3>
3867 S3: b_1 = load
3868 S4: a_3 = a_2 + b_1
3869 exit:
3870 S5: a_4 = PHI<a_3>
3871 S6: store a_4
3873 S2 and S4 form an SCC and are part of the same loop. Every other
3874 statement is in a singleton SCC. In this example there is a one-to-one
3875 mapping between SCCs and partitions and the partition dag looks like this:
3877 S1 S3
3879 S2+S4
3885 S2, S3 and S4 will have a higher execution frequency than the other
3886 statements, so when optimizing for speed, the goal is to avoid any
3887 layout changes:
3889 - within S3
3890 - within S2+S4
3891 - on the S3->S2+S4 edge
3893 For example, if S3 was originally a reversing load, the goal of the
3894 pass is to make it an unreversed load and change the layout on the
3895 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
3896 on S1->S2+S4 and S5->S6 would also be acceptable.)
3898 The difference between SCCs and partitions becomes important if we
3899 add an outer loop:
3901 S1: a_1 = ...
3902 loop1:
3903 S2: a_2 = PHI<a_1, a_6>
3904 S3: b_1 = load
3905 S4: a_3 = a_2 + b_1
3906 loop2:
3907 S5: a_4 = PHI<a_3, a_5>
3908 S6: c_1 = load
3909 S7: a_5 = a_4 + c_1
3910 exit2:
3911 S8: a_6 = PHI<a_5>
3912 S9: store a_6
3913 exit1:
3915 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
3916 for speed, we usually do not want restrictions in the outer loop to "infect"
3917 the decision for the inner loop. For example, if an outer-loop node
3918 in the SCC contains a statement with a fixed layout, that should not
3919 prevent the inner loop from using a different layout. Conversely,
3920 the inner loop should not dictate a layout to the outer loop: if the
3921 outer loop does a lot of computation, then it may not be efficient to
3922 do all of that computation in the inner loop's preferred layout.
3924 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
3925 and S5+S7 (inner). We also try to arrange partitions so that:
3927 - the partition for an outer loop comes before the partition for
3928 an inner loop
3930 - if a sibling loop A dominates a sibling loop B, A's partition
3931 comes before B's
3933 This gives the following partition dag for the example above:
3935 S1 S3
3937 S2+S4+S8 S6
3938 | \\ /
3939 | S5+S7
3943 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
3944 one for a reversal of the edge S7->S8.
3946 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
3947 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
3948 preferred layout against the cost of changing the layout on entry to the
3949 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
3951 Although this works well when optimizing for speed, it has the downside
3952 when optimizing for size that the choice of layout for S5+S7 is completely
3953 independent of S9, which lessens the chance of reducing the overall number
3954 of permutations. We therefore do not partition SCCs when optimizing
3955 for size.
3957 To give a concrete example of the difference between optimizing
3958 for size and speed, consider:
3960 a[0] = (b[1] << c[3]) - d[1];
3961 a[1] = (b[0] << c[2]) - d[0];
3962 a[2] = (b[3] << c[1]) - d[3];
3963 a[3] = (b[2] << c[0]) - d[2];
3965 There are three different layouts here: one for a, one for b and d,
3966 and one for c. When optimizing for speed it is better to permute each
3967 of b, c and d into the order required by a, since those permutations
3968 happen in parallel. But when optimizing for size, it is better to:
3970 - permute c into the same order as b
3971 - do the arithmetic
3972 - permute the result into the order required by a
3974 This gives 2 permutations rather than 3. */
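
/* Put in terms of the slpg_layout_cost metrics above (an informal
   illustration that treats each layout change as a unit-weight permutation
   and ignores the arithmetic common to both schemes): the speed-oriented
   choice costs { depth 1, total 3 }, three permutations in parallel,
   whereas the size-oriented choice costs { depth 2, total 2 }, two
   permutations in series on the path through c.  is_better_than therefore
   prefers the former when optimizing for speed and the latter when
   optimizing for size.  */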
3976 class vect_optimize_slp_pass
3978 public:
3979 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
3980 void run ();
3982 private:
3983 /* Graph building. */
3984 struct loop *containing_loop (slp_tree);
3985 bool is_cfg_latch_edge (graph_edge *);
3986 void build_vertices (hash_set<slp_tree> &, slp_tree);
3987 void build_vertices ();
3988 void build_graph ();
3990 /* Partitioning. */
3991 void create_partitions ();
3992 template<typename T> void for_each_partition_edge (unsigned int, T);
3994 /* Layout selection. */
3995 bool is_compatible_layout (slp_tree, unsigned int);
3996 int change_layout_cost (slp_tree, unsigned int, unsigned int);
3997 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
3998 unsigned int);
3999 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
4000 int, unsigned int);
4001 int internal_node_cost (slp_tree, int, unsigned int);
4002 void start_choosing_layouts ();
4004 /* Cost propagation. */
4005 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
4006 unsigned int, unsigned int);
4007 slpg_layout_cost total_in_cost (unsigned int);
4008 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
4009 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
4010 void forward_pass ();
4011 void backward_pass ();
4013 /* Rematerialization. */
4014 slp_tree get_result_with_layout (slp_tree, unsigned int);
4015 void materialize ();
4017 /* Clean-up. */
4018 void remove_redundant_permutations ();
4020 void dump ();
4022 vec_info *m_vinfo;
4024 /* True if we should optimize the graph for size, false if we should
4025 optimize it for speed. (It wouldn't be easy to make this decision
4026 more locally.) */
4027 bool m_optimize_size;
4029 /* A graph of all SLP nodes, with edges leading from uses to definitions.
4030 In other words, a node's predecessors are its slp_tree parents and
4031 a node's successors are its slp_tree children. */
4032 graph *m_slpg = nullptr;
4034 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
4035 auto_vec<slpg_vertex> m_vertices;
4037 /* The list of all leaves of M_SLPG, such as external definitions, constants,
4038 and loads. */
4039 auto_vec<int> m_leafs;
4041 /* This array has one entry for every vector layout that we're considering.
4042 Element 0 is null and indicates "no change". Other entries describe
4043 permutations that are inherent in the current graph and that we would
4044 like to reverse if possible.
4046 For example, a permutation { 1, 2, 3, 0 } means that something has
4047 effectively been permuted in that way, such as a load group
4048 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
4049 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
4050 in order to put things "back" in order. */
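
/* To make the reversal in the comment above concrete: indexing the loaded
   group { a[1], a[2], a[3], a[0] } through { 3, 0, 1, 2 }
   (result[i] = group[perm[i]]) yields { a[0], a[1], a[2], a[3] },
   i.e. the original order.  */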
4051 auto_vec<vec<unsigned> > m_perms;
4053 /* A partitioning of the nodes for which a layout must be chosen.
4054 Each partition represents an <SCC, cfg loop> pair; that is,
4055 nodes in different SCCs belong to different partitions, and nodes
4056 within an SCC can be further partitioned according to a containing
4057 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4059 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4060 from leaves (such as loads) to roots (such as stores).
4062 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4063 auto_vec<slpg_partition_info> m_partitions;
4065 /* The list of all nodes for which a layout must be chosen. Nodes for
4066 partition P come before the nodes for partition P+1. Nodes within a
4067 partition are in reverse postorder. */
4068 auto_vec<unsigned int> m_partitioned_nodes;
4070 /* Index P * num-layouts + L contains the cost of using layout L
4071 for partition P. */
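
/* For example, with three candidate layouts the costs for partition 2
   occupy indices 6, 7 and 8 of this array.  */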
4072 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4074 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4075 original output of node N adjusted to have layout L. */
4076 auto_vec<slp_tree> m_node_layouts;
4079 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4080 Also record whether we should optimize anything for speed rather
4081 than size. */
4083 void
4084 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4085 slp_tree node)
4087 unsigned i;
4088 slp_tree child;
4090 if (visited.add (node))
4091 return;
4093 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4095 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4096 if (optimize_bb_for_speed_p (bb))
4097 m_optimize_size = false;
4100 node->vertex = m_vertices.length ();
4101 m_vertices.safe_push (slpg_vertex (node));
4103 bool leaf = true;
4104 bool force_leaf = false;
4105 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4106 if (child)
4108 leaf = false;
4109 build_vertices (visited, child);
4111 else
4112 force_leaf = true;
4113 /* Since SLP discovery works along use-def edges all cycles have an
4114 entry - but there is an exception for cycles where we do not handle
4115 the entry explicitly (but with a NULL SLP node), like some reductions
4116 and inductions. Force those SLP PHIs to act as leafs to make them
4117 backwards reachable. */
4118 if (leaf || force_leaf)
4119 m_leafs.safe_push (node->vertex);
4122 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4124 void
4125 vect_optimize_slp_pass::build_vertices ()
4127 hash_set<slp_tree> visited;
4128 unsigned i;
4129 slp_instance instance;
4130 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4131 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4134 /* Apply the bijective permutation PERM to VEC; if REVERSE, apply its inverse. */
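
/* For example, with PERM = { 1, 2, 3, 0 } and VEC = { a, b, c, d },
   the forward direction produces { b, c, d, a } (vec[i] = old[perm[i]])
   and the REVERSE direction produces { d, a, b, c } (vec[perm[i]] = old[i]),
   undoing a previous forward application.  */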
4136 template <class T>
4137 static void
4138 vect_slp_permute (vec<unsigned> perm,
4139 vec<T> &vec, bool reverse)
4141 auto_vec<T, 64> saved;
4142 saved.create (vec.length ());
4143 for (unsigned i = 0; i < vec.length (); ++i)
4144 saved.quick_push (vec[i]);
4146 if (reverse)
4148 for (unsigned i = 0; i < vec.length (); ++i)
4149 vec[perm[i]] = saved[i];
4150 for (unsigned i = 0; i < vec.length (); ++i)
4151 gcc_assert (vec[perm[i]] == saved[i]);
4153 else
4155 for (unsigned i = 0; i < vec.length (); ++i)
4156 vec[i] = saved[perm[i]];
4157 for (unsigned i = 0; i < vec.length (); ++i)
4158 gcc_assert (vec[i] == saved[perm[i]]);
4162 /* Return the cfg loop that contains NODE. */
4164 struct loop *
4165 vect_optimize_slp_pass::containing_loop (slp_tree node)
4167 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4168 if (!rep)
4169 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4170 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4173 /* Return true if UD (an edge from a use to a definition) is associated
4174 with a loop latch edge in the cfg. */
4176 bool
4177 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4179 slp_tree use = m_vertices[ud->src].node;
4180 slp_tree def = m_vertices[ud->dest].node;
4181 if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4182 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4183 return false;
4185 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4186 return (is_a<gphi *> (use_rep->stmt)
4187 && bb_loop_header_p (gimple_bb (use_rep->stmt))
4188 && containing_loop (def) == containing_loop (use));
4191 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4192 a nonnull data field. */
4194 void
4195 vect_optimize_slp_pass::build_graph ()
4197 m_optimize_size = true;
4198 build_vertices ();
4200 m_slpg = new_graph (m_vertices.length ());
4201 for (slpg_vertex &v : m_vertices)
4202 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4203 if (child)
4205 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4206 if (is_cfg_latch_edge (ud))
4207 ud->data = this;
4211 /* Return true if E corresponds to a loop latch edge in the cfg. */
4213 static bool
4214 skip_cfg_latch_edges (graph_edge *e)
4216 return e->data;
4219 /* Create the node partitions. */
4221 void
4222 vect_optimize_slp_pass::create_partitions ()
4224 /* Calculate a postorder of the graph, ignoring edges that correspond
4225 to natural latch edges in the cfg. Reading the vector from the end
4226 to the beginning gives the reverse postorder. */
4227 auto_vec<int> initial_rpo;
4228 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4229 false, NULL, skip_cfg_latch_edges);
4230 gcc_assert (initial_rpo.length () == m_vertices.length ());
4232 /* Calculate the strongly connected components of the graph. */
4233 auto_vec<int> scc_grouping;
4234 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4236 /* Create a new index order in which all nodes from the same SCC are
4237 consecutive. Use scc_pos to record the index of the first node in
4238 each SCC. */
4239 auto_vec<unsigned int> scc_pos (num_sccs);
4240 int last_component = -1;
4241 unsigned int node_count = 0;
4242 for (unsigned int node_i : scc_grouping)
4244 if (last_component != m_slpg->vertices[node_i].component)
4246 last_component = m_slpg->vertices[node_i].component;
4247 gcc_assert (last_component == int (scc_pos.length ()));
4248 scc_pos.quick_push (node_count);
4250 node_count += 1;
4252 gcc_assert (node_count == initial_rpo.length ()
4253 && last_component + 1 == int (num_sccs));
4255 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4256 inside each SCC following the RPO we calculated above. The fact that
4257 we ignored natural latch edges when calculating the RPO should ensure
4258 that, for natural loop nests:
4260 - the first node that we encounter in a cfg loop is the loop header phi
4261 - the loop header phis are in dominance order
4263 Arranging for this is an optimization (see below) rather than a
4264 correctness issue. Unnatural loops with a tangled mess of backedges
4265 will still work correctly, but might give poorer results.
4267 Also update scc_pos so that it gives 1 + the index of the last node
4268 in the SCC. */
4269 m_partitioned_nodes.safe_grow (node_count);
4270 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4272 unsigned int node_i = initial_rpo[old_i];
4273 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4274 m_partitioned_nodes[new_i] = node_i;
4277 /* When optimizing for speed, partition each SCC based on the containing
4278 cfg loop. The order we constructed above should ensure that, for natural
4279 cfg loops, we'll create sub-SCC partitions for outer loops before
4280 the corresponding sub-SCC partitions for inner loops. Similarly,
4281 when one sibling loop A dominates another sibling loop B, we should
4282 create a sub-SCC partition for A before a sub-SCC partition for B.
4284 As above, nothing depends for correctness on whether this achieves
4285 a natural nesting, but we should get better results when it does. */
4286 m_partitions.reserve (m_vertices.length ());
4287 unsigned int next_partition_i = 0;
4288 hash_map<struct loop *, int> loop_partitions;
4289 unsigned int rpo_begin = 0;
4290 unsigned int num_partitioned_nodes = 0;
4291 for (unsigned int rpo_end : scc_pos)
4293 loop_partitions.empty ();
4294 unsigned int partition_i = next_partition_i;
4295 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4297 /* Handle externals and constants optimistically throughout.
4298 But treat existing vectors as fixed since we do not handle
4299 permuting them. */
4300 unsigned int node_i = m_partitioned_nodes[rpo_i];
4301 auto &vertex = m_vertices[node_i];
4302 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4303 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4304 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4305 vertex.partition = -1;
4306 else
4308 bool existed;
4309 if (m_optimize_size)
4310 existed = next_partition_i > partition_i;
4311 else
4313 struct loop *loop = containing_loop (vertex.node);
4314 auto &entry = loop_partitions.get_or_insert (loop, &existed);
4315 if (!existed)
4316 entry = next_partition_i;
4317 partition_i = entry;
4319 if (!existed)
4321 m_partitions.quick_push (slpg_partition_info ());
4322 next_partition_i += 1;
4324 vertex.partition = partition_i;
4325 num_partitioned_nodes += 1;
4326 m_partitions[partition_i].node_end += 1;
4329 rpo_begin = rpo_end;
4332 /* Assign ranges of consecutive node indices to each partition,
4333 in partition order. Start with node_end being the same as
4334 node_begin so that the next loop can use it as a counter. */
4335 unsigned int node_begin = 0;
4336 for (auto &partition : m_partitions)
4338 partition.node_begin = node_begin;
4339 node_begin += partition.node_end;
4340 partition.node_end = partition.node_begin;
4342 gcc_assert (node_begin == num_partitioned_nodes);
4344 /* Finally build the list of nodes in partition order. */
4345 m_partitioned_nodes.truncate (num_partitioned_nodes);
4346 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4348 int partition_i = m_vertices[node_i].partition;
4349 if (partition_i >= 0)
4351 unsigned int order_i = m_partitions[partition_i].node_end++;
4352 m_partitioned_nodes[order_i] = node_i;
4357 /* Look for edges from earlier partitions into node NODE_I and edges from
4358 node NODE_I into later partitions. Call:
4360 FN (ud, other_node_i)
4362 for each such use-to-def edge ud, where other_node_i is the node at the
4363 other end of the edge. */
4365 template<typename T>
4366 void
4367 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4369 int partition_i = m_vertices[node_i].partition;
4370 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4371 pred; pred = pred->pred_next)
4373 int src_partition_i = m_vertices[pred->src].partition;
4374 if (src_partition_i >= 0 && src_partition_i != partition_i)
4375 fn (pred, pred->src);
4377 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4378 succ; succ = succ->succ_next)
4380 int dest_partition_i = m_vertices[succ->dest].partition;
4381 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4382 fn (succ, succ->dest);
4386 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4387 that NODE would operate on. This test is independent of NODE's actual
4388 operation. */
4390 bool
4391 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4392 unsigned int layout_i)
4394 if (layout_i == 0)
4395 return true;
4397 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4398 return false;
4400 return true;
4403 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4404 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4405 layouts is incompatible with NODE or if the change is not possible for
4406 some other reason.
4408 The properties taken from NODE include the number of lanes and the
4409 vector type. The actual operation doesn't matter. */
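
/* As a hypothetical illustration: moving a 4-lane node from layout
   { 1, 0, 3, 2 } back to layout 0 asks the target for the single lane
   permutation { 1, 0, 3, 2 } on the node's vector type; the cost is then
   the number of permutes the target needs (but at least 1), while
   identical FROM_LAYOUT_I and TO_LAYOUT_I cost 0.  */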
4411 int
4412 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4413 unsigned int from_layout_i,
4414 unsigned int to_layout_i)
4416 if (!is_compatible_layout (node, from_layout_i)
4417 || !is_compatible_layout (node, to_layout_i))
4418 return -1;
4420 if (from_layout_i == to_layout_i)
4421 return 0;
4423 auto_vec<slp_tree, 1> children (1);
4424 children.quick_push (node);
4425 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4426 if (from_layout_i > 0)
4427 for (unsigned int i : m_perms[from_layout_i])
4428 perm.quick_push ({ 0, i });
4429 else
4430 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4431 perm.quick_push ({ 0, i });
4432 if (to_layout_i > 0)
4433 vect_slp_permute (m_perms[to_layout_i], perm, true);
4434 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4435 children, false);
4436 if (count >= 0)
4437 return MAX (count, 1);
4439 /* ??? In principle we could try changing via layout 0, giving two
4440 layout changes rather than 1. Doing that would require
4441 corresponding support in get_result_with_layout. */
4442 return -1;
4445 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4447 inline slpg_partition_layout_costs &
4448 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4449 unsigned int layout_i)
4451 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4454 /* Change PERM in one of two ways:
4456 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4457 chosen for child I of NODE.
4459 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4461 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
4463 void
4464 vect_optimize_slp_pass::
4465 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4466 int in_layout_i, unsigned int out_layout_i)
4468 for (auto &entry : perm)
4470 int this_in_layout_i = in_layout_i;
4471 if (this_in_layout_i < 0)
4473 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4474 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4475 this_in_layout_i = m_partitions[in_partition_i].layout;
4477 if (this_in_layout_i > 0)
4478 entry.second = m_perms[this_in_layout_i][entry.second];
4480 if (out_layout_i > 0)
4481 vect_slp_permute (m_perms[out_layout_i], perm, true);
4484 /* Check whether the target allows NODE to be rearranged so that the node's
4485 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4486 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4488 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4489 NODE can adapt to the layout changes that have (perhaps provisionally)
4490 been chosen for NODE's children, so that no extra permutations are
4491 needed on either the input or the output of NODE.
4493 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4494 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4496 IN_LAYOUT_I has no meaning for other types of node.
4498 Keeping the node as-is is always valid. If the target doesn't appear
4499 to support the node as-is, but might realistically support other layouts,
4500 then layout 0 instead has the cost of a worst-case permutation. On the
4501 one hand, this ensures that every node has at least one valid layout,
4502 avoiding what would otherwise be an awkward special case. On the other,
4503 it still encourages the pass to change an invalid pre-existing layout
4504 choice into a valid one. */
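
/* Informally, for the cases handled below: a VEC_PERM_EXPR whose adjusted
   permutation turns out to be a no-op costs 0, one that still needs a real
   permute costs 1, and one the target cannot handle returns -1, except that
   the layout-0-to-layout-0 query falls back to cost 1 when the node could
   in principle support some other layout.  A similar scheme applies to
   permuted loads.  */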
4506 int
4507 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4508 unsigned int out_layout_i)
4510 const int fallback_cost = 1;
4512 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4514 auto_lane_permutation_t tmp_perm;
4515 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4517 /* Check that the child nodes support the chosen layout. Checking
4518 the first child is enough, since any second child would have the
4519 same shape. */
4520 auto first_child = SLP_TREE_CHILDREN (node)[0];
4521 if (in_layout_i > 0
4522 && !is_compatible_layout (first_child, in_layout_i))
4523 return -1;
4525 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4526 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4527 node, tmp_perm,
4528 SLP_TREE_CHILDREN (node),
4529 false);
4530 if (count < 0)
4532 if (in_layout_i == 0 && out_layout_i == 0)
4534 /* Use the fallback cost if the node could in principle support
4535 some nonzero layout for both the inputs and the outputs.
4536 Otherwise assume that the node will be rejected later
4537 and rebuilt from scalars. */
4538 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4539 return fallback_cost;
4540 return 0;
4542 return -1;
4545 /* We currently have no way of telling whether the new layout is cheaper
4546 or more expensive than the old one. But at least in principle,
4547 it should be worth making zero permutations (whole-vector shuffles)
4548 cheaper than real permutations, in case the pass is able to remove
4549 the latter. */
4550 return count == 0 ? 0 : 1;
4553 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4554 if (rep
4555 && STMT_VINFO_DATA_REF (rep)
4556 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4557 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4559 auto_load_permutation_t tmp_perm;
4560 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4561 if (out_layout_i > 0)
4562 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4564 poly_uint64 vf = 1;
4565 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4566 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4567 unsigned int n_perms;
4568 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4569 nullptr, vf, true, false, &n_perms))
4571 auto rep = SLP_TREE_REPRESENTATIVE (node);
4572 if (out_layout_i == 0)
4574 /* Use the fallback cost if the load is an N-to-N permutation.
4575 Otherwise assume that the node will be rejected later
4576 and rebuilt from scalars. */
4577 if (STMT_VINFO_GROUPED_ACCESS (rep)
4578 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4579 == SLP_TREE_LANES (node)))
4580 return fallback_cost;
4581 return 0;
4583 return -1;
4586 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4587 return n_perms == 0 ? 0 : 1;
4590 return 0;
4593 /* Decide which element layouts we should consider using. Calculate the
4594 weights associated with inserting layout changes on partition edges.
4595 Also mark partitions that cannot change layout, by setting their
4596 layout to zero. */
4598 void
4599 vect_optimize_slp_pass::start_choosing_layouts ()
4601 /* Used to assign unique permutation indices. */
4602 using perm_hash = unbounded_hashmap_traits<
4603 vec_free_hash_base<int_hash_base<unsigned>>,
4604 int_hash<int, -1, -2>
4606 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4608 /* Layout 0 is "no change". */
4609 m_perms.safe_push (vNULL);
4611 /* Create layouts from existing permutations. */
4612 auto_load_permutation_t tmp_perm;
4613 for (unsigned int node_i : m_partitioned_nodes)
4615 /* Leafs also double as entries to the reverse graph. Allow the
4616 layout of those to be changed. */
4617 auto &vertex = m_vertices[node_i];
4618 auto &partition = m_partitions[vertex.partition];
4619 if (!m_slpg->vertices[node_i].succ)
4620 partition.layout = 0;
4622 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4623 slp_tree node = vertex.node;
4624 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4625 slp_tree child;
4626 unsigned HOST_WIDE_INT imin, imax = 0;
4627 bool any_permute = false;
4628 tmp_perm.truncate (0);
4629 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4631 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4632 unpermuted, record a layout that reverses this permutation.
4634 We would need more work to cope with loads that are internally
4635 permuted and also have inputs (such as masks for
4636 IFN_MASK_LOADs). */
4637 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4638 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4640 partition.layout = -1;
4641 continue;
4643 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4644 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4645 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4647 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4648 && SLP_TREE_CHILDREN (node).length () == 1
4649 && (child = SLP_TREE_CHILDREN (node)[0])
4650 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4651 .is_constant (&imin)))
4653 /* If the child has the same vector size as this node,
4654 reversing the permutation can make the permutation a no-op.
4655 In other cases it can change a true permutation into a
4656 full-vector extract. */
4657 tmp_perm.reserve (SLP_TREE_LANES (node));
4658 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4659 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4661 else
4662 continue;
4664 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4666 unsigned idx = tmp_perm[j];
4667 imin = MIN (imin, idx);
4668 imax = MAX (imax, idx);
4669 if (idx - tmp_perm[0] != j)
4670 any_permute = true;
4672 /* If the span doesn't match we'd disrupt VF computation; avoid
4673 that for now. */
4674 if (imax - imin + 1 != SLP_TREE_LANES (node))
4675 continue;
4676 /* If there's no permute, there's no need to split one out. In this case
4677 we can consider turning a load into a permuted load, if that
4678 turns out to be cheaper than alternatives. */
4679 if (!any_permute)
4681 partition.layout = -1;
4682 continue;
4685 /* For now only handle true permutes, like
4686 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4687 when permuting constants and invariants, keeping the permute
4688 bijective. */
4689 auto_sbitmap load_index (SLP_TREE_LANES (node));
4690 bitmap_clear (load_index);
4691 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4692 bitmap_set_bit (load_index, tmp_perm[j] - imin);
4693 unsigned j;
4694 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4695 if (!bitmap_bit_p (load_index, j))
4696 break;
4697 if (j != SLP_TREE_LANES (node))
4698 continue;
4700 vec<unsigned> perm = vNULL;
4701 perm.safe_grow (SLP_TREE_LANES (node), true);
4702 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4703 perm[j] = tmp_perm[j] - imin;
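
/* As a hypothetical example: a node that loads { a[2], a[3], a[0], a[1] }
   from a group of size 4 reaches this point with tmp_perm { 2, 3, 0, 1 },
   imin 0 and imax 3; the span matches the four lanes and the permutation is
   bijective, so (unless the candidate limit below has been reached) the
   partition records the layout { 2, 3, 0, 1 }.  */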
4705 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4707 /* Continue to use existing layouts, but don't add any more. */
4708 int *entry = layout_ids.get (perm);
4709 partition.layout = entry ? *entry : 0;
4710 perm.release ();
4712 else
4714 bool existed;
4715 int &layout_i = layout_ids.get_or_insert (perm, &existed);
4716 if (existed)
4717 perm.release ();
4718 else
4720 layout_i = m_perms.length ();
4721 m_perms.safe_push (perm);
4723 partition.layout = layout_i;
4727 /* Initially assume that every layout is possible and has zero cost
4728 in every partition. */
4729 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4730 * m_perms.length ());
4732 /* We have to mark outgoing permutations facing non-associating-reduction
4733 graph entries that are not represented as needing to be materialized.
4734 slp_inst_kind_bb_reduc currently only covers associatable reductions. */
4735 for (slp_instance instance : m_vinfo->slp_instances)
4736 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4738 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4739 m_partitions[m_vertices[node_i].partition].layout = 0;
4741 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4743 stmt_vec_info stmt_info
4744 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4745 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4746 if (needs_fold_left_reduction_p (TREE_TYPE
4747 (gimple_get_lhs (stmt_info->stmt)),
4748 STMT_VINFO_REDUC_CODE (reduc_info)))
4750 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4751 m_partitions[m_vertices[node_i].partition].layout = 0;
4755 /* Check which layouts each node and partition can handle. Calculate the
4756 weights associated with inserting layout changes on edges. */
4757 for (unsigned int node_i : m_partitioned_nodes)
4759 auto &vertex = m_vertices[node_i];
4760 auto &partition = m_partitions[vertex.partition];
4761 slp_tree node = vertex.node;
4763 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4765 vertex.weight = vect_slp_node_weight (node);
4767 /* We do not handle stores with a permutation, so all
4768 incoming permutations must have been materialized.
4770 We also don't handle masked grouped loads, which lack a
4771 permutation vector. In this case the memory locations
4772 form an implicit second input to the loads, on top of the
4773 explicit mask input, and the memory input's layout cannot
4774 be changed.
4776 On the other hand, we do support permuting gather loads and
4777 masked gather loads, where each scalar load is independent
4778 of the others. This can be useful if the address/index input
4779 benefits from permutation. */
4780 if (STMT_VINFO_DATA_REF (rep)
4781 && STMT_VINFO_GROUPED_ACCESS (rep)
4782 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4783 partition.layout = 0;
4785 /* We cannot change the layout of an operation that is
4786 not independent of the lanes. Note this is an explicit
4787 negative list since that's much shorter than the respective
4788 positive one but it's critical to keep maintaining it. */
4789 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4790 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4792 case CFN_COMPLEX_ADD_ROT90:
4793 case CFN_COMPLEX_ADD_ROT270:
4794 case CFN_COMPLEX_MUL:
4795 case CFN_COMPLEX_MUL_CONJ:
4796 case CFN_VEC_ADDSUB:
4797 case CFN_VEC_FMADDSUB:
4798 case CFN_VEC_FMSUBADD:
4799 partition.layout = 0;
4800 default:;
4804 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4806 auto &other_vertex = m_vertices[other_node_i];
4808 /* Count the number of edges from earlier partitions and the number
4809 of edges to later partitions. */
4810 if (other_vertex.partition < vertex.partition)
4811 partition.in_degree += 1;
4812 else
4813 partition.out_degree += 1;
4815 /* If the current node uses the result of OTHER_NODE_I, accumulate
4816 the effects of that. */
4817 if (ud->src == int (node_i))
4819 other_vertex.out_weight += vertex.weight;
4820 other_vertex.out_degree += 1;
4823 for_each_partition_edge (node_i, process_edge);
4827 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4828 its current (provisional) choice of layout. The inputs do not necessarily
4829 have the same layout as each other. */
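
/* For example (hypothetical values): if two earlier partitions contribute
   { depth 1, total 2 } and { depth 2, total 1 } once their own costs have
   been split among their consumers, the combined incoming cost is
   { depth 2, total 3 }, since parallel inputs overlap in depth but their
   operation counts add.  */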
4831 slpg_layout_cost
4832 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4834 auto &vertex = m_vertices[node_i];
4835 slpg_layout_cost cost;
4836 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4838 auto &other_vertex = m_vertices[other_node_i];
4839 if (other_vertex.partition < vertex.partition)
4841 auto &other_partition = m_partitions[other_vertex.partition];
4842 auto &other_costs = partition_layout_costs (other_vertex.partition,
4843 other_partition.layout);
4844 slpg_layout_cost this_cost = other_costs.in_cost;
4845 this_cost.add_serial_cost (other_costs.internal_cost);
4846 this_cost.split (other_partition.out_degree);
4847 cost.add_parallel_cost (this_cost);
4850 for_each_partition_edge (node_i, add_cost);
4851 return cost;
4854 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4855 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4856 slpg_layout_cost::impossible () if the change isn't possible. */
4858 slpg_layout_cost
4859 vect_optimize_slp_pass::
4860 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4861 unsigned int layout2_i)
4863 auto &def_vertex = m_vertices[ud->dest];
4864 auto &use_vertex = m_vertices[ud->src];
4865 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4866 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4867 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4868 use_layout_i);
4869 if (factor < 0)
4870 return slpg_layout_cost::impossible ();
4872 /* We have a choice of putting the layout change at the site of the
4873 definition or at the site of the use. Prefer the former when
4874 optimizing for size or when the execution frequency of the
4875 definition is no greater than the combined execution frequencies of
4876 the uses. When putting the layout change at the site of the definition,
4877 divvy up the cost among all consumers. */
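/* For example (hypothetical numbers): a definition of weight 1 with
   out_degree 4 and out_weight 8 whose layout change needs a single permute
   (factor 1) is charged at the definition site; each consumer then sees
   { depth 1, total 0.25 } rather than the { depth 2, total 2 } that a
   weight-2 use would pay for a use-site permute.  */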
4878 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4880 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4881 cost.split (def_vertex.out_degree);
4882 return cost;
4884 return { use_vertex.weight * factor, m_optimize_size };
4887 /* UD represents a use-def link between FROM_NODE_I and a node in a later
4888 partition; FROM_NODE_I could be the definition node or the use node.
4889 The node at the other end of the link wants to use layout TO_LAYOUT_I.
4890 Return the cost of any necessary fix-ups on edge UD, or return
4891 slpg_layout_cost::impossible () if the change isn't possible.
4893 At this point, FROM_NODE_I's partition has chosen the cheapest
4894 layout based on the information available so far, but this choice
4895 is only provisional. */
4897 slpg_layout_cost
4898 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
4899 unsigned int to_layout_i)
4901 auto &from_vertex = m_vertices[from_node_i];
4902 unsigned int from_partition_i = from_vertex.partition;
4903 slpg_partition_info &from_partition = m_partitions[from_partition_i];
4904 gcc_assert (from_partition.layout >= 0);
4906 /* First calculate the cost on the assumption that FROM_PARTITION sticks
4907 with its current layout preference. */
4908 slpg_layout_cost cost = slpg_layout_cost::impossible ();
4909 auto edge_cost = edge_layout_cost (ud, from_node_i,
4910 from_partition.layout, to_layout_i);
4911 if (edge_cost.is_possible ())
4913 auto &from_costs = partition_layout_costs (from_partition_i,
4914 from_partition.layout);
4915 cost = from_costs.in_cost;
4916 cost.add_serial_cost (from_costs.internal_cost);
4917 cost.split (from_partition.out_degree);
4918 cost.add_serial_cost (edge_cost);
4921 /* Take the minimum of that cost and the cost that applies if
4922 FROM_PARTITION instead switches to TO_LAYOUT_I. */
4923 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
4924 to_layout_i);
4925 if (direct_layout_costs.is_possible ())
4927 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
4928 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
4929 direct_cost.split (from_partition.out_degree);
4930 if (!cost.is_possible ()
4931 || direct_cost.is_better_than (cost, m_optimize_size))
4932 cost = direct_cost;
4935 return cost;
4938 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
4939 partition; TO_NODE_I could be the definition node or the use node.
4940 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
4941 return the cost of any necessary fix-ups on edge UD, or
4942 slpg_layout_cost::impossible () if the choice cannot be made.
4944 At this point, TO_NODE_I's partition has a fixed choice of layout. */
4946 slpg_layout_cost
4947 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
4948 unsigned int from_layout_i)
4950 auto &to_vertex = m_vertices[to_node_i];
4951 unsigned int to_partition_i = to_vertex.partition;
4952 slpg_partition_info &to_partition = m_partitions[to_partition_i];
4953 gcc_assert (to_partition.layout >= 0);
4955 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
4956 adjusted for this input having layout FROM_LAYOUT_I. Assume that
4957 any other inputs keep their current choice of layout. */
4958 auto &to_costs = partition_layout_costs (to_partition_i,
4959 to_partition.layout);
4960 if (ud->src == int (to_node_i)
4961 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
4963 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
4964 auto old_layout = from_partition.layout;
4965 from_partition.layout = from_layout_i;
4966 int factor = internal_node_cost (to_vertex.node, -1,
4967 to_partition.layout);
4968 from_partition.layout = old_layout;
4969 if (factor >= 0)
4971 slpg_layout_cost cost = to_costs.out_cost;
4972 cost.add_serial_cost ({ to_vertex.weight * factor,
4973 m_optimize_size });
4974 cost.split (to_partition.in_degree);
4975 return cost;
4979 /* Compute the cost if we insert any necessary layout change on edge UD. */
4980 auto edge_cost = edge_layout_cost (ud, to_node_i,
4981 to_partition.layout, from_layout_i);
4982 if (edge_cost.is_possible ())
4984 slpg_layout_cost cost = to_costs.out_cost;
4985 cost.add_serial_cost (to_costs.internal_cost);
4986 cost.split (to_partition.in_degree);
4987 cost.add_serial_cost (edge_cost);
4988 return cost;
4991 return slpg_layout_cost::impossible ();
4994 /* Make a forward pass through the partitions, accumulating input costs.
4995 Make a tentative (provisional) choice of layout for each partition,
4996 ensuring that this choice still allows later partitions to keep
4997 their original layout. */
4999 void
5000 vect_optimize_slp_pass::forward_pass ()
5002 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5003 ++partition_i)
5005 auto &partition = m_partitions[partition_i];
5007 /* If the partition consists of a single VEC_PERM_EXPR, precompute
5008 the incoming cost that would apply if every predecessor partition
5009 keeps its current layout. This is used within the loop below. */
5010 slpg_layout_cost in_cost;
5011 slp_tree single_node = nullptr;
5012 if (partition.node_end == partition.node_begin + 1)
5014 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
5015 single_node = m_vertices[node_i].node;
5016 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5017 in_cost = total_in_cost (node_i);
5020 /* Go through the possible layouts. Decide which ones are valid
5021 for this partition and record which of the valid layouts has
5022 the lowest cost. */
5023 unsigned int min_layout_i = 0;
5024 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5025 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5027 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5028 if (!layout_costs.is_possible ())
5029 continue;
5031 /* If the recorded layout is already 0 then the layout cannot
5032 change. */
5033 if (partition.layout == 0 && layout_i != 0)
5035 layout_costs.mark_impossible ();
5036 continue;
5039 bool is_possible = true;
5040 for (unsigned int order_i = partition.node_begin;
5041 order_i < partition.node_end; ++order_i)
5043 unsigned int node_i = m_partitioned_nodes[order_i];
5044 auto &vertex = m_vertices[node_i];
5046 /* Reject the layout if it is individually incompatible
5047 with any node in the partition. */
5048 if (!is_compatible_layout (vertex.node, layout_i))
5050 is_possible = false;
5051 break;
5054 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5056 auto &other_vertex = m_vertices[other_node_i];
5057 if (other_vertex.partition < vertex.partition)
5059 /* Accumulate the incoming costs from earlier
5060 partitions, plus the cost of any layout changes
5061 on UD itself. */
5062 auto cost = forward_cost (ud, other_node_i, layout_i);
5063 if (!cost.is_possible ())
5064 is_possible = false;
5065 else
5066 layout_costs.in_cost.add_parallel_cost (cost);
5068 else
5069 /* Reject the layout if it would make layout 0 impossible
5070 for later partitions. This amounts to testing that the
5071 target supports reversing the layout change on edges
5072 to later partitions.
5074 In principle, it might be possible to push a layout
5075 change all the way down a graph, so that it never
5076 needs to be reversed and so that the target doesn't
5077 need to support the reverse operation. But it would
5078 be awkward to bail out if we hit a partition that
5079 does not support the new layout, especially since
5080 we are not dealing with a lattice. */
5081 is_possible &= edge_layout_cost (ud, other_node_i, 0,
5082 layout_i).is_possible ();
5084 for_each_partition_edge (node_i, add_cost);
5086 /* Accumulate the cost of using LAYOUT_I within NODE,
5087 both for the inputs and the outputs. */
5088 int factor = internal_node_cost (vertex.node, layout_i,
5089 layout_i);
5090 if (factor < 0)
5092 is_possible = false;
5093 break;
5095 else if (factor)
5096 layout_costs.internal_cost.add_serial_cost
5097 ({ vertex.weight * factor, m_optimize_size });
5099 if (!is_possible)
5101 layout_costs.mark_impossible ();
5102 continue;
5105 /* Combine the incoming and partition-internal costs. */
5106 slpg_layout_cost combined_cost = layout_costs.in_cost;
5107 combined_cost.add_serial_cost (layout_costs.internal_cost);
5109 /* If this partition consists of a single VEC_PERM_EXPR, see
5110 if the VEC_PERM_EXPR can be changed to support output layout
5111 LAYOUT_I while keeping all the provisional choices of input
5112 layout. */
5113 if (single_node
5114 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5116 int factor = internal_node_cost (single_node, -1, layout_i);
5117 if (factor >= 0)
5119 auto weight = m_vertices[single_node->vertex].weight;
5120 slpg_layout_cost internal_cost
5121 = { weight * factor, m_optimize_size };
5123 slpg_layout_cost alt_cost = in_cost;
5124 alt_cost.add_serial_cost (internal_cost);
5125 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5127 combined_cost = alt_cost;
5128 layout_costs.in_cost = in_cost;
5129 layout_costs.internal_cost = internal_cost;
5134 /* Record the layout with the lowest cost. Prefer layout 0 in
5135 the event of a tie between it and another layout. */
5136 if (!min_layout_cost.is_possible ()
5137 || combined_cost.is_better_than (min_layout_cost,
5138 m_optimize_size))
5140 min_layout_i = layout_i;
5141 min_layout_cost = combined_cost;
5145 /* This loop's handling of earlier partitions should ensure that
5146 choosing the original layout for the current partition is no
5147 less valid than it was in the original graph, even with the
5148 provisional layout choices for those earlier partitions. */
5149 gcc_assert (min_layout_cost.is_possible ());
5150 partition.layout = min_layout_i;
5154 /* Make a backward pass through the partitions, accumulating output costs.
5155 Make a final choice of layout for each partition. */
5157 void
5158 vect_optimize_slp_pass::backward_pass ()
5160 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5162 auto &partition = m_partitions[partition_i];
5164 unsigned int min_layout_i = 0;
5165 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5166 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5168 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5169 if (!layout_costs.is_possible ())
5170 continue;
5172 /* Accumulate the costs from successor partitions. */
5173 bool is_possible = true;
5174 for (unsigned int order_i = partition.node_begin;
5175 order_i < partition.node_end; ++order_i)
5177 unsigned int node_i = m_partitioned_nodes[order_i];
5178 auto &vertex = m_vertices[node_i];
5179 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5181 auto &other_vertex = m_vertices[other_node_i];
5182 auto &other_partition = m_partitions[other_vertex.partition];
5183 if (other_vertex.partition > vertex.partition)
5185 /* Accumulate the incoming costs from later
5186 partitions, plus the cost of any layout changes
5187 on UD itself. */
5188 auto cost = backward_cost (ud, other_node_i, layout_i);
5189 if (!cost.is_possible ())
5190 is_possible = false;
5191 else
5192 layout_costs.out_cost.add_parallel_cost (cost);
5194 else
5195 /* Make sure that earlier partitions can (if necessary
5196 or beneficial) keep the layout that they chose in
5197 the forward pass. This ensures that there is at
5198 least one valid choice of layout. */
5199 is_possible &= edge_layout_cost (ud, other_node_i,
5200 other_partition.layout,
5201 layout_i).is_possible ();
5203 for_each_partition_edge (node_i, add_cost);
5205 if (!is_possible)
5207 layout_costs.mark_impossible ();
5208 continue;
5211 /* Locally combine the costs from the forward and backward passes.
5212 (This combined cost is not passed on, since that would lead
5213 to double counting.) */
5214 slpg_layout_cost combined_cost = layout_costs.in_cost;
5215 combined_cost.add_serial_cost (layout_costs.internal_cost);
5216 combined_cost.add_serial_cost (layout_costs.out_cost);
5218 /* Record the layout with the lowest cost. Prefer layout 0 in
5219 the event of a tie between it and another layout. */
5220 if (!min_layout_cost.is_possible ()
5221 || combined_cost.is_better_than (min_layout_cost,
5222 m_optimize_size))
5224 min_layout_i = layout_i;
5225 min_layout_cost = combined_cost;
5229 gcc_assert (min_layout_cost.is_possible ());
5230 partition.layout = min_layout_i;
5234 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5235 NODE already has the layout that was selected for its partition. */
5237 slp_tree
5238 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5239 unsigned int to_layout_i)
5241 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5242 slp_tree result = m_node_layouts[result_i];
5243 if (result)
5244 return result;
5246 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5247 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5248 /* We can't permute vector defs in place. */
5249 && SLP_TREE_VEC_DEFS (node).is_empty ()))
5251 /* If the vector is uniform or unchanged, there's nothing to do. */
5252 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5253 result = node;
5254 else
5256 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5257 result = vect_create_new_slp_node (scalar_ops);
5258 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5261 else
5263 unsigned int partition_i = m_vertices[node->vertex].partition;
5264 unsigned int from_layout_i = m_partitions[partition_i].layout;
5265 if (from_layout_i == to_layout_i)
5266 return node;
5268 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5269 permutation instead of a serial one. Leave the new permutation
5270 in TMP_PERM on success. */
5271 auto_lane_permutation_t tmp_perm;
5272 unsigned int num_inputs = 1;
5273 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5275 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5276 if (from_layout_i != 0)
5277 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5278 if (to_layout_i != 0)
5279 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5280 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5281 tmp_perm,
5282 SLP_TREE_CHILDREN (node),
5283 false) >= 0)
5284 num_inputs = SLP_TREE_CHILDREN (node).length ();
5285 else
5286 tmp_perm.truncate (0);
5289 if (dump_enabled_p ())
5291 if (tmp_perm.length () > 0)
5292 dump_printf_loc (MSG_NOTE, vect_location,
5293 "duplicating permutation node %p with"
5294 " layout %d\n",
5295 (void *) node, to_layout_i);
5296 else
5297 dump_printf_loc (MSG_NOTE, vect_location,
5298 "inserting permutation node in place of %p\n",
5299 (void *) node);
5302 unsigned int num_lanes = SLP_TREE_LANES (node);
5303 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5304 if (SLP_TREE_SCALAR_STMTS (node).length ())
5306 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5307 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5308 if (from_layout_i != 0)
5309 vect_slp_permute (m_perms[from_layout_i], stmts, false);
5310 if (to_layout_i != 0)
5311 vect_slp_permute (m_perms[to_layout_i], stmts, true);
5313 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5314 SLP_TREE_LANES (result) = num_lanes;
5315 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5316 result->vertex = -1;
5318 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5319 if (tmp_perm.length ())
5321 lane_perm.safe_splice (tmp_perm);
5322 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5324 else
5326 lane_perm.create (num_lanes);
5327 for (unsigned j = 0; j < num_lanes; ++j)
5328 lane_perm.quick_push ({ 0, j });
5329 if (from_layout_i != 0)
5330 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5331 if (to_layout_i != 0)
5332 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5333 SLP_TREE_CHILDREN (result).safe_push (node);
5335 for (slp_tree child : SLP_TREE_CHILDREN (result))
5336 child->refcnt++;
5338 m_node_layouts[result_i] = result;
5339 return result;
5342 /* Apply the chosen vector layouts to the SLP graph. */
5344 void
5345 vect_optimize_slp_pass::materialize ()
5347 /* We no longer need the costs, so avoid having two O(N * P) arrays
5348 live at the same time. */
5349 m_partition_layout_costs.release ();
5350 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
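/* FULLY_FOLDED tracks VEC_PERM nodes whose lane permutation already absorbed
   the layouts of their inputs; the child-replacement walk below leaves the
   children of such nodes untouched.  */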
5352 auto_sbitmap fully_folded (m_vertices.length ());
5353 bitmap_clear (fully_folded);
5354 for (unsigned int node_i : m_partitioned_nodes)
5356 auto &vertex = m_vertices[node_i];
5357 slp_tree node = vertex.node;
5358 int layout_i = m_partitions[vertex.partition].layout;
5359 gcc_assert (layout_i >= 0);
5361 /* Rearrange the scalar statements to match the chosen layout. */
5362 if (layout_i > 0)
5363 vect_slp_permute (m_perms[layout_i],
5364 SLP_TREE_SCALAR_STMTS (node), true);
5366 /* Update load and lane permutations. */
5367 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5369 /* First try to absorb the input vector layouts. If that fails,
5370 force the inputs to have layout LAYOUT_I too. We checked that
5371 that was possible before deciding to use nonzero output layouts.
5372 (Note that at this stage we don't really have any guarantee that
5373 the target supports the original VEC_PERM_EXPR.) */
5374 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5375 auto_lane_permutation_t tmp_perm;
5376 tmp_perm.safe_splice (perm);
5377 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5378 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5379 tmp_perm,
5380 SLP_TREE_CHILDREN (node),
5381 false) >= 0)
5383 if (dump_enabled_p ()
5384 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5385 perm.begin ()))
5386 dump_printf_loc (MSG_NOTE, vect_location,
5387 "absorbing input layouts into %p\n",
5388 (void *) node);
5389 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5390 bitmap_set_bit (fully_folded, node_i);
5392 else
5394 /* Not MSG_MISSED because it would make no sense to users. */
5395 if (dump_enabled_p ())
5396 dump_printf_loc (MSG_NOTE, vect_location,
5397 "failed to absorb input layouts into %p\n",
5398 (void *) node);
5399 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5402 else
5404 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5405 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5406 if (layout_i > 0)
5407 /* ??? When we handle non-bijective permutes the idea
5408 is that we can force the load-permutation to be
5409 { min, min + 1, min + 2, ... max }. But then the
5410 scalar defs might no longer match the lane content
5411 which means wrong-code with live lane vectorization.
5412 So we possibly have to have NULL entries for those. */
5413 vect_slp_permute (m_perms[layout_i], load_perm, true);
5417 /* Do this before any nodes disappear, since it involves a walk
5418 over the leaves. */
5419 remove_redundant_permutations ();
5421 /* Replace each child with a correctly laid-out version. */
5422 for (unsigned int node_i : m_partitioned_nodes)
5424 /* Skip nodes that have already been handled above. */
5425 if (bitmap_bit_p (fully_folded, node_i))
5426 continue;
5428 auto &vertex = m_vertices[node_i];
5429 int in_layout_i = m_partitions[vertex.partition].layout;
5430 gcc_assert (in_layout_i >= 0);
5432 unsigned j;
5433 slp_tree child;
5434 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5436 if (!child)
5437 continue;
5439 slp_tree new_child = get_result_with_layout (child, in_layout_i);
5440 if (new_child != child)
5442 vect_free_slp_tree (child);
5443 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5444 new_child->refcnt += 1;
5450 /* Elide load permutations that are not necessary. Such permutations might
5451 be pre-existing, rather than created by the layout optimizations. */
5453 void
5454 vect_optimize_slp_pass::remove_redundant_permutations ()
5456 for (unsigned int node_i : m_leafs)
5458 slp_tree node = m_vertices[node_i].node;
5459 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5460 continue;
5462 /* In basic block vectorization we allow any subchain of an interleaving
5463 chain.
5464 FORNOW: not in loop SLP because of realignment complications. */
5465 if (is_a <bb_vec_info> (m_vinfo))
5467 bool subchain_p = true;
5468 stmt_vec_info next_load_info = NULL;
5469 stmt_vec_info load_info;
5470 unsigned j;
5471 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5473 if (j != 0
5474 && (next_load_info != load_info
5475 || DR_GROUP_GAP (load_info) != 1))
5477 subchain_p = false;
5478 break;
5480 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5482 if (subchain_p)
5484 SLP_TREE_LOAD_PERMUTATION (node).release ();
5485 continue;
5488 else
5490 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5491 stmt_vec_info load_info;
5492 bool this_load_permuted = false;
5493 unsigned j;
5494 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5495 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5497 this_load_permuted = true;
5498 break;
 5500 /* When this isn't a grouped access we know it's a single element
5501 and contiguous. */
5502 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5504 if (!this_load_permuted
5505 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5506 || SLP_TREE_LANES (node) == 1))
5507 SLP_TREE_LOAD_PERMUTATION (node).release ();
5508 continue;
5510 stmt_vec_info first_stmt_info
5511 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5512 if (!this_load_permuted
5513 /* The load requires permutation when unrolling exposes
5514 a gap either because the group is larger than the SLP
5515 group-size or because there is a gap between the groups. */
5516 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5517 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5518 && DR_GROUP_GAP (first_stmt_info) == 0)))
5520 SLP_TREE_LOAD_PERMUTATION (node).release ();
5521 continue;
5527 /* Print the partition graph and layout information to the dump file. */
5529 void
5530 vect_optimize_slp_pass::dump ()
5532 dump_printf_loc (MSG_NOTE, vect_location,
5533 "SLP optimize permutations:\n");
5534 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5536 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5537 const char *sep = "";
5538 for (unsigned int idx : m_perms[layout_i])
5540 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5541 sep = ", ";
5543 dump_printf (MSG_NOTE, " }\n");
5545 dump_printf_loc (MSG_NOTE, vect_location,
5546 "SLP optimize partitions:\n");
5547 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5548 ++partition_i)
5550 auto &partition = m_partitions[partition_i];
5551 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5552 dump_printf_loc (MSG_NOTE, vect_location,
5553 " partition %d (layout %d):\n",
5554 partition_i, partition.layout);
5555 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5556 for (unsigned int order_i = partition.node_begin;
5557 order_i < partition.node_end; ++order_i)
5559 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5560 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5561 (void *) vertex.node);
5562 dump_printf_loc (MSG_NOTE, vect_location,
5563 " weight: %f\n",
5564 vertex.weight.to_double ());
5565 if (vertex.out_degree)
5566 dump_printf_loc (MSG_NOTE, vect_location,
5567 " out weight: %f (degree %d)\n",
5568 vertex.out_weight.to_double (),
5569 vertex.out_degree);
5570 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5571 dump_printf_loc (MSG_NOTE, vect_location,
5572 " op: VEC_PERM_EXPR\n");
5573 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5574 dump_printf_loc (MSG_NOTE, vect_location,
5575 " op template: %G", rep->stmt);
5577 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5578 for (unsigned int order_i = partition.node_begin;
5579 order_i < partition.node_end; ++order_i)
5581 unsigned int node_i = m_partitioned_nodes[order_i];
5582 auto &vertex = m_vertices[node_i];
5583 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5585 auto &other_vertex = m_vertices[other_node_i];
5586 if (other_vertex.partition < vertex.partition)
5587 dump_printf_loc (MSG_NOTE, vect_location,
5588 " - %p [%d] --> %p\n",
5589 (void *) other_vertex.node,
5590 other_vertex.partition,
5591 (void *) vertex.node);
5592 else
5593 dump_printf_loc (MSG_NOTE, vect_location,
5594 " - %p --> [%d] %p\n",
5595 (void *) vertex.node,
5596 other_vertex.partition,
5597 (void *) other_vertex.node);
5599 for_each_partition_edge (node_i, print_edge);
5602 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5604 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5605 if (layout_costs.is_possible ())
5607 dump_printf_loc (MSG_NOTE, vect_location,
5608 " layout %d:%s\n", layout_i,
5609 partition.layout == int (layout_i)
5610 ? " (*)" : "");
5611 slpg_layout_cost combined_cost = layout_costs.in_cost;
5612 combined_cost.add_serial_cost (layout_costs.internal_cost);
5613 combined_cost.add_serial_cost (layout_costs.out_cost);
5614 #define TEMPLATE "{depth: %f, total: %f}"
5615 dump_printf_loc (MSG_NOTE, vect_location,
5616 " " TEMPLATE "\n",
5617 layout_costs.in_cost.depth.to_double (),
5618 layout_costs.in_cost.total.to_double ());
5619 dump_printf_loc (MSG_NOTE, vect_location,
5620 " + " TEMPLATE "\n",
5621 layout_costs.internal_cost.depth.to_double (),
5622 layout_costs.internal_cost.total.to_double ());
5623 dump_printf_loc (MSG_NOTE, vect_location,
5624 " + " TEMPLATE "\n",
5625 layout_costs.out_cost.depth.to_double (),
5626 layout_costs.out_cost.total.to_double ());
5627 dump_printf_loc (MSG_NOTE, vect_location,
5628 " = " TEMPLATE "\n",
5629 combined_cost.depth.to_double (),
5630 combined_cost.total.to_double ());
5631 #undef TEMPLATE
5633 else
5634 dump_printf_loc (MSG_NOTE, vect_location,
5635 " layout %d: rejected\n", layout_i);
5640 /* Main entry point for the SLP graph optimization pass. */
5642 void
5643 vect_optimize_slp_pass::run ()
5645 build_graph ();
5646 create_partitions ();
5647 start_choosing_layouts ();
5648 if (m_perms.length () > 1)
5650 forward_pass ();
5651 backward_pass ();
5652 if (dump_enabled_p ())
5653 dump ();
5654 materialize ();
5655 while (!m_perms.is_empty ())
5656 m_perms.pop ().release ();
5658 else
5659 remove_redundant_permutations ();
5660 free_graph (m_slpg);
5663 /* Optimize the SLP graph of VINFO. */
5665 void
5666 vect_optimize_slp (vec_info *vinfo)
5668 if (vinfo->slp_instances.is_empty ())
5669 return;
5670 vect_optimize_slp_pass (vinfo).run ();
5673 /* Gather loads reachable from the individual SLP graph entries. */
5675 void
5676 vect_gather_slp_loads (vec_info *vinfo)
5678 unsigned i;
5679 slp_instance instance;
5680 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5682 hash_set<slp_tree> visited;
5683 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5684 SLP_INSTANCE_TREE (instance), visited);
 5689 /* For each possible SLP instance decide whether to SLP it and calculate the
 5690 overall unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
5691 least one instance. */
5693 bool
5694 vect_make_slp_decision (loop_vec_info loop_vinfo)
5696 unsigned int i;
5697 poly_uint64 unrolling_factor = 1;
5698 const vec<slp_instance> &slp_instances
5699 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5700 slp_instance instance;
5701 int decided_to_slp = 0;
5703 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5705 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5707 /* FORNOW: SLP if you can. */
5708 /* All unroll factors have the form:
5710 GET_MODE_SIZE (vinfo->vector_mode) * X
5712 for some rational X, so they must have a common multiple. */
5713 unrolling_factor
5714 = force_common_multiple (unrolling_factor,
5715 SLP_INSTANCE_UNROLLING_FACTOR (instance));
5717 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5718 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5719 loop-based vectorization. Such stmts will be marked as HYBRID. */
5720 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5721 decided_to_slp++;
5724 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5726 if (decided_to_slp && dump_enabled_p ())
5728 dump_printf_loc (MSG_NOTE, vect_location,
5729 "Decided to SLP %d instances. Unrolling factor ",
5730 decided_to_slp);
5731 dump_dec (MSG_NOTE, unrolling_factor);
5732 dump_printf (MSG_NOTE, "\n");
5735 return (decided_to_slp > 0);
5738 /* Private data for vect_detect_hybrid_slp. */
5739 struct vdhs_data
5741 loop_vec_info loop_vinfo;
5742 vec<stmt_vec_info> *worklist;
5745 /* Walker for walk_gimple_op. */
5747 static tree
5748 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5750 walk_stmt_info *wi = (walk_stmt_info *)data;
5751 vdhs_data *dat = (vdhs_data *)wi->info;
5753 if (wi->is_lhs)
5754 return NULL_TREE;
5756 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5757 if (!def_stmt_info)
5758 return NULL_TREE;
5759 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5760 if (PURE_SLP_STMT (def_stmt_info))
5762 if (dump_enabled_p ())
5763 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5764 def_stmt_info->stmt);
5765 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5766 dat->worklist->safe_push (def_stmt_info);
5769 return NULL_TREE;
 5772 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
 5773 if so, otherwise push it to WORKLIST. */
5775 static void
5776 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5777 vec<stmt_vec_info> &worklist,
5778 stmt_vec_info stmt_info)
5780 if (dump_enabled_p ())
5781 dump_printf_loc (MSG_NOTE, vect_location,
5782 "Processing hybrid candidate : %G", stmt_info->stmt);
5783 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5784 imm_use_iterator iter2;
5785 ssa_op_iter iter1;
5786 use_operand_p use_p;
5787 def_operand_p def_p;
5788 bool any_def = false;
5789 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5791 any_def = true;
5792 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5794 if (is_gimple_debug (USE_STMT (use_p)))
5795 continue;
5796 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
 5797 /* An out-of-loop use means this is a loop_vect sink. */
5798 if (!use_info)
5800 if (dump_enabled_p ())
5801 dump_printf_loc (MSG_NOTE, vect_location,
5802 "Found loop_vect sink: %G", stmt_info->stmt);
5803 worklist.safe_push (stmt_info);
5804 return;
5806 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5808 if (dump_enabled_p ())
5809 dump_printf_loc (MSG_NOTE, vect_location,
5810 "Found loop_vect use: %G", use_info->stmt);
5811 worklist.safe_push (stmt_info);
5812 return;
 5816 /* No def means this is a loop_vect sink. */
5817 if (!any_def)
5819 if (dump_enabled_p ())
5820 dump_printf_loc (MSG_NOTE, vect_location,
5821 "Found loop_vect sink: %G", stmt_info->stmt);
5822 worklist.safe_push (stmt_info);
5823 return;
5825 if (dump_enabled_p ())
5826 dump_printf_loc (MSG_NOTE, vect_location,
5827 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5828 STMT_SLP_TYPE (stmt_info) = pure_slp;
5831 /* Find stmts that must be both vectorized and SLPed. */
5833 void
5834 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5836 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5838 /* All stmts participating in SLP are marked pure_slp, all other
5839 stmts are loop_vect.
5840 First collect all loop_vect stmts into a worklist.
5841 SLP patterns cause not all original scalar stmts to appear in
5842 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
 5843 Rectify this here by doing a backward walk over the IL, considering
 5844 stmts as loop_vect only when they are used by a loop_vect stmt, and
 5845 otherwise marking them as pure_slp. */
5846 auto_vec<stmt_vec_info> worklist;
5847 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5849 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5850 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5851 gsi_next (&gsi))
5853 gphi *phi = gsi.phi ();
5854 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5855 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5856 maybe_push_to_hybrid_worklist (loop_vinfo,
5857 worklist, stmt_info);
5859 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5860 gsi_prev (&gsi))
5862 gimple *stmt = gsi_stmt (gsi);
5863 if (is_gimple_debug (stmt))
5864 continue;
5865 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
5866 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5868 for (gimple_stmt_iterator gsi2
5869 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5870 !gsi_end_p (gsi2); gsi_next (&gsi2))
5872 stmt_vec_info patt_info
5873 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
5874 if (!STMT_SLP_TYPE (patt_info)
5875 && STMT_VINFO_RELEVANT (patt_info))
5876 maybe_push_to_hybrid_worklist (loop_vinfo,
5877 worklist, patt_info);
5879 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5881 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5882 maybe_push_to_hybrid_worklist (loop_vinfo,
5883 worklist, stmt_info);
5887 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
5888 mark any SLP vectorized stmt as hybrid.
5889 ??? We're visiting def stmts N times (once for each non-SLP and
5890 once for each hybrid-SLP use). */
5891 walk_stmt_info wi;
5892 vdhs_data dat;
5893 dat.worklist = &worklist;
5894 dat.loop_vinfo = loop_vinfo;
5895 memset (&wi, 0, sizeof (wi));
5896 wi.info = (void *)&dat;
5897 while (!worklist.is_empty ())
5899 stmt_vec_info stmt_info = worklist.pop ();
5900 /* Since SSA operands are not set up for pattern stmts we need
5901 to use walk_gimple_op. */
5902 wi.is_lhs = 0;
5903 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
 5904 /* For gather/scatter make sure to walk the offset operand, which
 5905 can be a scaling and conversion away. */
5906 gather_scatter_info gs_info;
5907 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5908 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
5910 int dummy;
5911 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
5917 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
5919 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
5920 : vec_info (vec_info::bb, shared),
5921 bbs (_bbs),
5922 roots (vNULL)
5924 for (unsigned i = 0; i < bbs.length (); ++i)
5926 if (i != 0)
5927 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5928 gsi_next (&si))
5930 gphi *phi = si.phi ();
5931 gimple_set_uid (phi, 0);
5932 add_stmt (phi);
5934 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5935 !gsi_end_p (gsi); gsi_next (&gsi))
5937 gimple *stmt = gsi_stmt (gsi);
5938 gimple_set_uid (stmt, 0);
5939 if (is_gimple_debug (stmt))
5940 continue;
5941 add_stmt (stmt);
5947 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
5948 stmts in the basic block. */
5950 _bb_vec_info::~_bb_vec_info ()
5952 /* Reset region marker. */
5953 for (unsigned i = 0; i < bbs.length (); ++i)
5955 if (i != 0)
5956 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5957 gsi_next (&si))
5959 gphi *phi = si.phi ();
5960 gimple_set_uid (phi, -1);
5962 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5963 !gsi_end_p (gsi); gsi_next (&gsi))
5965 gimple *stmt = gsi_stmt (gsi);
5966 gimple_set_uid (stmt, -1);
5970 for (unsigned i = 0; i < roots.length (); ++i)
5972 roots[i].stmts.release ();
5973 roots[i].roots.release ();
5974 roots[i].remain.release ();
5976 roots.release ();
5979 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
 5980 given that its child nodes have already been processed, and that
5981 their def types currently match their SLP node's def type. */
5983 static bool
5984 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
5985 slp_instance node_instance,
5986 stmt_vector_for_cost *cost_vec)
5988 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5990 /* Calculate the number of vector statements to be created for the
5991 scalar stmts in this node. For SLP reductions it is equal to the
5992 number of vector statements in the children (which has already been
5993 calculated by the recursive call). Otherwise it is the number of
5994 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
5995 VF divided by the number of elements in a vector. */
5996 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
5997 && !STMT_VINFO_DATA_REF (stmt_info)
5998 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6000 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
6001 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
6003 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6004 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
6005 break;
6008 else
6010 poly_uint64 vf;
6011 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6012 vf = loop_vinfo->vectorization_factor;
6013 else
6014 vf = 1;
6015 unsigned int group_size = SLP_TREE_LANES (node);
6016 tree vectype = SLP_TREE_VECTYPE (node);
6017 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6018 = vect_get_num_vectors (vf * group_size, vectype);
6021 /* Handle purely internal nodes. */
6022 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6024 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
6025 return false;
6027 stmt_vec_info slp_stmt_info;
6028 unsigned int i;
6029 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
6031 if (STMT_VINFO_LIVE_P (slp_stmt_info)
6032 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6033 node_instance, i,
6034 false, cost_vec))
6035 return false;
6037 return true;
6040 bool dummy;
6041 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6042 node, node_instance, cost_vec);
6045 /* Try to build NODE from scalars, returning true on success.
6046 NODE_INSTANCE is the SLP instance that contains NODE. */
6048 static bool
6049 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6050 slp_instance node_instance)
6052 stmt_vec_info stmt_info;
6053 unsigned int i;
6055 if (!is_a <bb_vec_info> (vinfo)
6056 || node == SLP_INSTANCE_TREE (node_instance)
6057 || !SLP_TREE_SCALAR_STMTS (node).exists ()
6058 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6059 /* Force the mask use to be built from scalars instead. */
6060 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6061 return false;
6063 if (dump_enabled_p ())
6064 dump_printf_loc (MSG_NOTE, vect_location,
6065 "Building vector operands of %p from scalars instead\n",
6066 (void *) node);
6068 /* Don't remove and free the child nodes here, since they could be
6069 referenced by other structures. The analysis and scheduling phases
6070 (need to) ignore child nodes of anything that isn't vect_internal_def. */
6071 unsigned int group_size = SLP_TREE_LANES (node);
6072 SLP_TREE_DEF_TYPE (node) = vect_external_def;
6073 /* Invariants get their vector type from the uses. */
6074 SLP_TREE_VECTYPE (node) = NULL_TREE;
6075 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6076 SLP_TREE_LOAD_PERMUTATION (node).release ();
6077 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6079 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6080 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6082 return true;
6085 /* Return true if all elements of the slice are the same. */
6086 bool
6087 vect_scalar_ops_slice::all_same_p () const
6089 for (unsigned int i = 1; i < length; ++i)
6090 if (!operand_equal_p (op (0), op (i)))
6091 return false;
6092 return true;
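/* Hash the slice S by combining the hashes of its scalar operands.  */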
6095 hashval_t
6096 vect_scalar_ops_slice_hash::hash (const value_type &s)
6098 hashval_t hash = 0;
6099 for (unsigned i = 0; i < s.length; ++i)
6100 hash = iterative_hash_expr (s.op (i), hash);
6101 return hash;
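/* Return true if slices S1 and S2 have the same length and pairwise
   operand_equal_p elements.  */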
6104 bool
6105 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6106 const compare_type &s2)
6108 if (s1.length != s2.length)
6109 return false;
6110 for (unsigned i = 0; i < s1.length; ++i)
6111 if (!operand_equal_p (s1.op (i), s2.op (i)))
6112 return false;
6113 return true;
6116 /* Compute the prologue cost for invariant or constant operands represented
6117 by NODE. */
6119 static void
6120 vect_prologue_cost_for_slp (slp_tree node,
6121 stmt_vector_for_cost *cost_vec)
 6123 /* There's a special case of an existing vector, which costs nothing. */
6124 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6125 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6126 return;
6127 /* Without looking at the actual initializer a vector of
 6128 constants can be implemented as a load from the constant pool.
6129 When all elements are the same we can use a splat. */
6130 tree vectype = SLP_TREE_VECTYPE (node);
6131 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6132 unsigned HOST_WIDE_INT const_nunits;
6133 unsigned nelt_limit;
6134 auto ops = &SLP_TREE_SCALAR_OPS (node);
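/* STARTS records the first scalar-op index of each distinct vector that
   needs to be built, so that duplicate vectors are costed only once.  */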
6135 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6136 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6137 && ! multiple_p (const_nunits, group_size))
6139 nelt_limit = const_nunits;
6140 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6141 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6142 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6143 starts.quick_push (i * const_nunits);
6145 else
6147 /* If either the vector has variable length or the vectors
6148 are composed of repeated whole groups we only need to
6149 cost construction once. All vectors will be the same. */
6150 nelt_limit = group_size;
6151 starts.quick_push (0);
6153 /* ??? We're just tracking whether vectors in a single node are the same.
6154 Ideally we'd do something more global. */
6155 bool passed = false;
6156 for (unsigned int start : starts)
6158 vect_cost_for_stmt kind;
6159 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6160 kind = vector_load;
6161 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6162 kind = scalar_to_vec;
6163 else
6164 kind = vec_construct;
6165 /* The target cost hook has no idea which part of the SLP node
6166 we are costing so avoid passing it down more than once. Pass
6167 it to the first vec_construct or scalar_to_vec part since for those
6168 the x86 backend tries to account for GPR to XMM register moves. */
6169 record_stmt_cost (cost_vec, 1, kind,
6170 (kind != vector_load && !passed) ? node : nullptr,
6171 vectype, 0, vect_prologue);
6172 if (kind != vector_load)
6173 passed = true;
6177 /* Analyze statements contained in SLP tree NODE after recursively analyzing
 6178 the subtree.  NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
6180 Return true if the operations are supported. */
6182 static bool
6183 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6184 slp_instance node_instance,
6185 hash_set<slp_tree> &visited_set,
6186 vec<slp_tree> &visited_vec,
6187 stmt_vector_for_cost *cost_vec)
6189 int i, j;
6190 slp_tree child;
6192 /* Assume we can code-generate all invariants. */
6193 if (!node
6194 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6195 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6196 return true;
6198 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6200 if (dump_enabled_p ())
6201 dump_printf_loc (MSG_NOTE, vect_location,
6202 "Failed cyclic SLP reference in %p\n", (void *) node);
6203 return false;
6205 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6207 /* If we already analyzed the exact same set of scalar stmts we're done.
6208 We share the generated vector stmts for those. */
6209 if (visited_set.add (node))
6210 return true;
6211 visited_vec.safe_push (node);
6213 bool res = true;
6214 unsigned visited_rec_start = visited_vec.length ();
6215 unsigned cost_vec_rec_start = cost_vec->length ();
6216 bool seen_non_constant_child = false;
6217 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6219 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6220 visited_set, visited_vec,
6221 cost_vec);
6222 if (!res)
6223 break;
6224 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6225 seen_non_constant_child = true;
6227 /* We're having difficulties scheduling nodes with just constant
6228 operands and no scalar stmts since we then cannot compute a stmt
6229 insertion place. */
6230 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6232 if (dump_enabled_p ())
6233 dump_printf_loc (MSG_NOTE, vect_location,
6234 "Cannot vectorize all-constant op node %p\n",
6235 (void *) node);
6236 res = false;
6239 if (res)
6240 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6241 cost_vec);
6242 /* If analysis failed we have to pop all recursive visited nodes
6243 plus ourselves. */
6244 if (!res)
6246 while (visited_vec.length () >= visited_rec_start)
6247 visited_set.remove (visited_vec.pop ());
6248 cost_vec->truncate (cost_vec_rec_start);
 6251 /* When the node can be vectorized, cost the invariant nodes it references.
 6252 This is not done in DFS order to allow the referring node's
 6253 vectorizable_* calls to nail down the invariant node's vector type
6254 and possibly unshare it if it needs a different vector type than
6255 other referrers. */
6256 if (res)
6257 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6258 if (child
6259 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6260 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6261 /* Perform usual caching, note code-generation still
6262 code-gens these nodes multiple times but we expect
6263 to CSE them later. */
6264 && !visited_set.add (child))
6266 visited_vec.safe_push (child);
6267 /* ??? After auditing more code paths make a "default"
6268 and push the vector type from NODE to all children
6269 if it is not already set. */
6270 /* Compute the number of vectors to be generated. */
6271 tree vector_type = SLP_TREE_VECTYPE (child);
6272 if (!vector_type)
6274 /* For shifts with a scalar argument we don't need
6275 to cost or code-generate anything.
 6276 ??? Represent this more explicitly. */
6277 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6278 == shift_vec_info_type)
6279 && j == 1);
6280 continue;
6282 unsigned group_size = SLP_TREE_LANES (child);
6283 poly_uint64 vf = 1;
6284 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6285 vf = loop_vinfo->vectorization_factor;
6286 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6287 = vect_get_num_vectors (vf * group_size, vector_type);
6288 /* And cost them. */
6289 vect_prologue_cost_for_slp (child, cost_vec);
6292 /* If this node or any of its children can't be vectorized, try pruning
6293 the tree here rather than felling the whole thing. */
6294 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6296 /* We'll need to revisit this for invariant costing and number
6297 of vectorized stmt setting. */
6298 res = true;
6301 return res;
6304 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6305 region and that can be vectorized using vectorizable_live_operation
 6306 with STMT_VINFO_LIVE_P.  Live operations that are not handled will cause
 6307 the scalar code computing them to be retained. */
6309 static void
6310 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6311 slp_instance instance,
6312 stmt_vector_for_cost *cost_vec,
6313 hash_set<stmt_vec_info> &svisited,
6314 hash_set<slp_tree> &visited)
6316 if (visited.add (node))
6317 return;
6319 unsigned i;
6320 stmt_vec_info stmt_info;
6321 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6322 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6324 if (svisited.contains (stmt_info))
6325 continue;
6326 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6327 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6328 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6329 /* Only the pattern root stmt computes the original scalar value. */
6330 continue;
6331 bool mark_visited = true;
6332 gimple *orig_stmt = orig_stmt_info->stmt;
6333 ssa_op_iter op_iter;
6334 def_operand_p def_p;
6335 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6337 imm_use_iterator use_iter;
6338 gimple *use_stmt;
6339 stmt_vec_info use_stmt_info;
6340 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6341 if (!is_gimple_debug (use_stmt))
6343 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6344 if (!use_stmt_info
6345 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6347 STMT_VINFO_LIVE_P (stmt_info) = true;
6348 if (vectorizable_live_operation (bb_vinfo, stmt_info,
6349 node, instance, i,
6350 false, cost_vec))
6351 /* ??? So we know we can vectorize the live stmt
6352 from one SLP node. If we cannot do so from all
6353 or none consistently we'd have to record which
6354 SLP node (and lane) we want to use for the live
6355 operation. So make sure we can code-generate
6356 from all nodes. */
6357 mark_visited = false;
6358 else
6359 STMT_VINFO_LIVE_P (stmt_info) = false;
6360 break;
6363 /* We have to verify whether we can insert the lane extract
6364 before all uses. The following is a conservative approximation.
6365 We cannot put this into vectorizable_live_operation because
6366 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6367 doesn't work.
6368 Note that while the fact that we emit code for loads at the
 6369 first load should make this a non-problem, leaves we construct
6370 from scalars are vectorized after the last scalar def.
6371 ??? If we'd actually compute the insert location during
 6372 analysis we could use something less conservative than the last
6373 scalar stmt in the node for the dominance check. */
6374 /* ??? What remains is "live" uses in vector CTORs in the same
6375 SLP graph which is where those uses can end up code-generated
6376 right after their definition instead of close to their original
6377 use. But that would restrict us to code-generate lane-extracts
6378 from the latest stmt in a node. So we compensate for this
6379 during code-generation, simply not replacing uses for those
6380 hopefully rare cases. */
6381 if (STMT_VINFO_LIVE_P (stmt_info))
6382 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6383 if (!is_gimple_debug (use_stmt)
6384 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6385 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6386 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6388 if (dump_enabled_p ())
6389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6390 "Cannot determine insertion place for "
6391 "lane extract\n");
6392 STMT_VINFO_LIVE_P (stmt_info) = false;
6393 mark_visited = true;
6396 if (mark_visited)
6397 svisited.add (stmt_info);
6400 slp_tree child;
6401 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6402 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6403 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
6404 cost_vec, svisited, visited);
6407 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6409 static bool
6410 vectorizable_bb_reduc_epilogue (slp_instance instance,
6411 stmt_vector_for_cost *cost_vec)
6413 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6414 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6415 if (reduc_code == MINUS_EXPR)
6416 reduc_code = PLUS_EXPR;
6417 internal_fn reduc_fn;
6418 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6419 if (!vectype
6420 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6421 || reduc_fn == IFN_LAST
6422 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6423 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6424 TREE_TYPE (vectype)))
6426 if (dump_enabled_p ())
6427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6428 "not vectorized: basic block reduction epilogue "
6429 "operation unsupported.\n");
6430 return false;
6433 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6434 cost log2 vector operations plus shuffles and one extraction. */
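/* For example, an 8-lane vector gives floor_log2 (8) == 3, i.e. three
   vector stmts, three permutes and a single vec_to_scalar extraction.  */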
6435 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6436 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6437 vectype, 0, vect_body);
6438 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6439 vectype, 0, vect_body);
6440 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6441 vectype, 0, vect_body);
6443 /* Since we replace all stmts of a possibly longer scalar reduction
 6444 chain, account for the extra scalar stmts for that. */
6445 record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
6446 instance->root_stmts[0], 0, vect_body);
6447 return true;
6450 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6451 and recurse to children. */
6453 static void
6454 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6455 hash_set<slp_tree> &visited)
6457 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6458 || visited.add (node))
6459 return;
6461 stmt_vec_info stmt;
6462 unsigned i;
6463 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6464 roots.remove (vect_orig_stmt (stmt));
6466 slp_tree child;
6467 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6468 if (child)
6469 vect_slp_prune_covered_roots (child, roots, visited);
6472 /* Analyze statements in SLP instances of VINFO. Return true if the
6473 operations are supported. */
6475 bool
6476 vect_slp_analyze_operations (vec_info *vinfo)
6478 slp_instance instance;
6479 int i;
6481 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6483 hash_set<slp_tree> visited;
6484 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6486 auto_vec<slp_tree> visited_vec;
6487 stmt_vector_for_cost cost_vec;
6488 cost_vec.create (2);
6489 if (is_a <bb_vec_info> (vinfo))
6490 vect_location = instance->location ();
6491 if (!vect_slp_analyze_node_operations (vinfo,
6492 SLP_INSTANCE_TREE (instance),
6493 instance, visited, visited_vec,
6494 &cost_vec)
6495 /* CTOR instances require vectorized defs for the SLP tree root. */
6496 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6497 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6498 != vect_internal_def
6499 /* Make sure we vectorized with the expected type. */
6500 || !useless_type_conversion_p
6501 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6502 (instance->root_stmts[0]->stmt))),
6503 TREE_TYPE (SLP_TREE_VECTYPE
6504 (SLP_INSTANCE_TREE (instance))))))
6505 /* Check we can vectorize the reduction. */
6506 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6507 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6509 slp_tree node = SLP_INSTANCE_TREE (instance);
6510 stmt_vec_info stmt_info;
6511 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6512 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6513 else
6514 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6515 if (dump_enabled_p ())
6516 dump_printf_loc (MSG_NOTE, vect_location,
6517 "removing SLP instance operations starting from: %G",
6518 stmt_info->stmt);
6519 vect_free_slp_instance (instance);
6520 vinfo->slp_instances.ordered_remove (i);
6521 cost_vec.release ();
6522 while (!visited_vec.is_empty ())
6523 visited.remove (visited_vec.pop ());
6525 else
6527 i++;
6528 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6530 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6531 cost_vec.release ();
6533 else
6534 /* For BB vectorization remember the SLP graph entry
6535 cost for later. */
6536 instance->cost_vec = cost_vec;
6540 /* Now look for SLP instances with a root that are covered by other
6541 instances and remove them. */
6542 hash_set<stmt_vec_info> roots;
6543 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6544 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6545 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6546 if (!roots.is_empty ())
6548 visited.empty ();
6549 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6550 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6551 visited);
6552 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6553 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6554 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6556 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6557 if (dump_enabled_p ())
6558 dump_printf_loc (MSG_NOTE, vect_location,
6559 "removing SLP instance operations starting "
6560 "from: %G", root->stmt);
6561 vect_free_slp_instance (instance);
6562 vinfo->slp_instances.ordered_remove (i);
6564 else
6565 ++i;
6568 /* Compute vectorizable live stmts. */
6569 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6571 hash_set<stmt_vec_info> svisited;
6572 hash_set<slp_tree> visited;
6573 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6575 vect_location = instance->location ();
6576 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6577 instance, &instance->cost_vec, svisited,
6578 visited);
6582 return !vinfo->slp_instances.is_empty ();
6585 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
 6586 collapsing any chain of intermediate leaders. */
6588 static slp_instance
6589 get_ultimate_leader (slp_instance instance,
6590 hash_map<slp_instance, slp_instance> &instance_leader)
6592 auto_vec<slp_instance *, 8> chain;
6593 slp_instance *tem;
6594 while (*(tem = instance_leader.get (instance)) != instance)
6596 chain.safe_push (tem);
6597 instance = *tem;
6599 while (!chain.is_empty ())
6600 *chain.pop () = instance;
6601 return instance;
6604 namespace {
6605 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6606 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6607 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6609 INSTANCE_LEADER is as for get_ultimate_leader. */
6611 template<typename T>
6612 bool
6613 vect_map_to_instance (slp_instance instance, T key,
6614 hash_map<T, slp_instance> &key_to_instance,
6615 hash_map<slp_instance, slp_instance> &instance_leader)
6617 bool existed_p;
6618 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6619 if (!existed_p)
6621 else if (key_instance != instance)
6623 /* If we're running into a previously marked key make us the
6624 leader of the current ultimate leader. This keeps the
6625 leader chain acyclic and works even when the current instance
6626 connects two previously independent graph parts. */
6627 slp_instance key_leader
6628 = get_ultimate_leader (key_instance, instance_leader);
6629 if (key_leader != instance)
6630 instance_leader.put (key_leader, instance);
6632 key_instance = instance;
6633 return existed_p;
6637 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6639 static void
6640 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6641 slp_instance instance, slp_tree node,
6642 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6643 hash_map<slp_tree, slp_instance> &node_to_instance,
6644 hash_map<slp_instance, slp_instance> &instance_leader)
6646 stmt_vec_info stmt_info;
6647 unsigned i;
6649 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6650 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6651 instance_leader);
6653 if (vect_map_to_instance (instance, node, node_to_instance,
6654 instance_leader))
6655 return;
6657 slp_tree child;
6658 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6659 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6660 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6661 node_to_instance, instance_leader);
6664 /* Partition the SLP graph into pieces that can be costed independently. */
6666 static void
6667 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6669 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6671 /* First walk the SLP graph assigning each involved scalar stmt a
6672 corresponding SLP graph entry and upon visiting a previously
 6673 marked stmt, make the stmt's leader the current SLP graph entry. */
6674 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6675 hash_map<slp_tree, slp_instance> node_to_instance;
6676 hash_map<slp_instance, slp_instance> instance_leader;
6677 slp_instance instance;
6678 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6680 instance_leader.put (instance, instance);
6681 vect_bb_partition_graph_r (bb_vinfo,
6682 instance, SLP_INSTANCE_TREE (instance),
6683 stmt_to_instance, node_to_instance,
6684 instance_leader);
6687 /* Then collect entries to each independent subgraph. */
6688 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6690 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6691 leader->subgraph_entries.safe_push (instance);
6692 if (dump_enabled_p ()
6693 && leader != instance)
6694 dump_printf_loc (MSG_NOTE, vect_location,
6695 "instance %p is leader of %p\n",
6696 (void *) leader, (void *) instance);
6700 /* Compute the set of scalar stmts participating in internal and external
6701 nodes. */
6703 static void
6704 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6705 hash_set<slp_tree> &visited,
6706 hash_set<stmt_vec_info> &vstmts,
6707 hash_set<stmt_vec_info> &estmts)
6709 int i;
6710 stmt_vec_info stmt_info;
6711 slp_tree child;
6713 if (visited.add (node))
6714 return;
6716 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6718 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6719 vstmts.add (stmt_info);
6721 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6722 if (child)
6723 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6724 vstmts, estmts);
6726 else
6727 for (tree def : SLP_TREE_SCALAR_OPS (node))
6729 stmt_vec_info def_stmt = vinfo->lookup_def (def);
6730 if (def_stmt)
6731 estmts.add (def_stmt);
6736 /* Compute the scalar cost of the SLP node NODE and its children
 6737 and record it in COST_VEC.  Do not account defs that are marked in
 6738 LIFE and update LIFE according to uses of NODE. */
6740 static void
6741 vect_bb_slp_scalar_cost (vec_info *vinfo,
6742 slp_tree node, vec<bool, va_heap> *life,
6743 stmt_vector_for_cost *cost_vec,
6744 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6745 hash_set<slp_tree> &visited)
6747 unsigned i;
6748 stmt_vec_info stmt_info;
6749 slp_tree child;
6751 if (visited.add (node))
6752 return;
6754 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6756 ssa_op_iter op_iter;
6757 def_operand_p def_p;
6759 if ((*life)[i])
6760 continue;
6762 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6763 gimple *orig_stmt = orig_stmt_info->stmt;
6765 /* If there is a non-vectorized use of the defs then the scalar
6766 stmt is kept live in which case we do not account it or any
6767 required defs in the SLP children in the scalar cost. This
6768 way we make the vectorization more costly when compared to
6769 the scalar cost. */
6770 if (!STMT_VINFO_LIVE_P (stmt_info))
6772 auto_vec<gimple *, 8> worklist;
6773 hash_set<gimple *> *worklist_visited = NULL;
6774 worklist.quick_push (orig_stmt);
6777 gimple *work_stmt = worklist.pop ();
6778 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
6780 imm_use_iterator use_iter;
6781 gimple *use_stmt;
6782 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
6783 DEF_FROM_PTR (def_p))
6784 if (!is_gimple_debug (use_stmt))
6786 stmt_vec_info use_stmt_info
6787 = vinfo->lookup_stmt (use_stmt);
6788 if (!use_stmt_info
6789 || !vectorized_scalar_stmts.contains (use_stmt_info))
6791 if (use_stmt_info
6792 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
6794 /* For stmts participating in patterns we have
 6795 to check their uses recursively. */
6796 if (!worklist_visited)
6797 worklist_visited = new hash_set<gimple *> ();
6798 if (!worklist_visited->add (use_stmt))
6799 worklist.safe_push (use_stmt);
6800 continue;
6802 (*life)[i] = true;
6803 goto next_lane;
6808 while (!worklist.is_empty ());
6809 next_lane:
6810 if (worklist_visited)
6811 delete worklist_visited;
6812 if ((*life)[i])
6813 continue;
6816 /* Count scalar stmts only once. */
6817 if (gimple_visited_p (orig_stmt))
6818 continue;
6819 gimple_set_visited (orig_stmt, true);
6821 vect_cost_for_stmt kind;
6822 if (STMT_VINFO_DATA_REF (orig_stmt_info))
6824 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
6825 kind = scalar_load;
6826 else
6827 kind = scalar_store;
6829 else if (vect_nop_conversion_p (orig_stmt_info))
6830 continue;
6831 /* For single-argument PHIs assume coalescing which means zero cost
6832 for the scalar and the vector PHIs. This avoids artificially
6833 favoring the vector path (but may pessimize it in some cases). */
6834 else if (is_a <gphi *> (orig_stmt_info->stmt)
6835 && gimple_phi_num_args
6836 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
6837 continue;
6838 else
6839 kind = scalar_stmt;
6840 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
6841 SLP_TREE_VECTYPE (node), 0, vect_body);
6844 auto_vec<bool, 20> subtree_life;
6845 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6847 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6849 /* Do not directly pass LIFE to the recursive call, copy it to
6850 confine changes in the callee to the current child/subtree. */
6851 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6853 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
6854 for (unsigned j = 0;
6855 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
6857 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
6858 if (perm.first == i)
6859 subtree_life[perm.second] = (*life)[j];
6862 else
6864 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
6865 subtree_life.safe_splice (*life);
6867 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
6868 vectorized_scalar_stmts, visited);
6869 subtree_life.truncate (0);
6874 /* Comparator for the loop-index sorted cost vectors. */
6876 static int
6877 li_cost_vec_cmp (const void *a_, const void *b_)
6879 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
6880 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
6881 if (a->first < b->first)
6882 return -1;
6883 else if (a->first == b->first)
6884 return 0;
6885 return 1;
6888 /* Check if vectorization of the basic block is profitable for the
6889 subgraph denoted by SLP_INSTANCES. */
6891 static bool
6892 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
6893 vec<slp_instance> slp_instances,
6894 loop_p orig_loop)
6896 slp_instance instance;
6897 int i;
6898 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
6899 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
6901 if (dump_enabled_p ())
6903 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
6904 hash_set<slp_tree> visited;
6905 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6906 vect_print_slp_graph (MSG_NOTE, vect_location,
6907 SLP_INSTANCE_TREE (instance), visited);
6910 /* Compute the set of scalar stmts we know will go away 'locally' when
6911 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
6912 not accurate for nodes promoted extern late or for scalar stmts that
6913 are used both in extern defs and in vectorized defs. */
6914 hash_set<stmt_vec_info> vectorized_scalar_stmts;
6915 hash_set<stmt_vec_info> scalar_stmts_in_externs;
6916 hash_set<slp_tree> visited;
6917 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6919 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
6920 SLP_INSTANCE_TREE (instance),
6921 visited,
6922 vectorized_scalar_stmts,
6923 scalar_stmts_in_externs);
6924 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
6925 vectorized_scalar_stmts.add (rstmt);
 6927 /* Scalar stmts used as defs in external nodes need to be preserved, so
6928 remove them from vectorized_scalar_stmts. */
6929 for (stmt_vec_info stmt : scalar_stmts_in_externs)
6930 vectorized_scalar_stmts.remove (stmt);
6932 /* Calculate scalar cost and sum the cost for the vector stmts
6933 previously collected. */
6934 stmt_vector_for_cost scalar_costs = vNULL;
6935 stmt_vector_for_cost vector_costs = vNULL;
6936 visited.empty ();
6937 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6939 auto_vec<bool, 20> life;
6940 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
6941 true);
6942 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6943 record_stmt_cost (&scalar_costs,
6944 SLP_INSTANCE_ROOT_STMTS (instance).length (),
6945 scalar_stmt,
6946 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
6947 vect_bb_slp_scalar_cost (bb_vinfo,
6948 SLP_INSTANCE_TREE (instance),
6949 &life, &scalar_costs, vectorized_scalar_stmts,
6950 visited);
6951 vector_costs.safe_splice (instance->cost_vec);
6952 instance->cost_vec.release ();
6955 if (dump_enabled_p ())
6956 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
6958 /* When costing non-loop vectorization we need to consider each covered
6959 loop independently and make sure vectorization is profitable. For
 6960 now we assume a loop may not be entered or may be executed an arbitrary
 6961 number of iterations (??? static information can provide more
 6962 precise info here), which means we can simply cost each containing
 6963 loop's stmts separately. */
6965 /* First produce cost vectors sorted by loop index. */
6966 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6967 li_scalar_costs (scalar_costs.length ());
6968 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6969 li_vector_costs (vector_costs.length ());
6970 stmt_info_for_cost *cost;
6971 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
6973 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6974 li_scalar_costs.quick_push (std::make_pair (l, cost));
6976 /* Use an arbitrary used loop as a fallback in case the first vector_costs
6977 entry does not have a stmt_info associated with it. */
6978 unsigned l = li_scalar_costs[0].first;
6979 FOR_EACH_VEC_ELT (vector_costs, i, cost)
6981 /* We inherit the loop from the previous COST; invariants, externals and
6982 extracts immediately follow the cost for the related stmt. */
6983 if (cost->stmt_info)
6984 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6985 li_vector_costs.quick_push (std::make_pair (l, cost));
6987 li_scalar_costs.qsort (li_cost_vec_cmp);
6988 li_vector_costs.qsort (li_cost_vec_cmp);
6990 /* Now cost the portions individually. */
6991 unsigned vi = 0;
6992 unsigned si = 0;
6993 bool profitable = true;
6994 while (si < li_scalar_costs.length ()
6995 && vi < li_vector_costs.length ())
6997 unsigned sl = li_scalar_costs[si].first;
6998 unsigned vl = li_vector_costs[vi].first;
6999 if (sl != vl)
7001 if (dump_enabled_p ())
7002 dump_printf_loc (MSG_NOTE, vect_location,
7003 "Scalar %d and vector %d loop part do not "
7004 "match up, skipping scalar part\n", sl, vl);
7005 /* Skip the scalar part, assuming zero cost on the vector side. */
7008 si++;
7010 while (si < li_scalar_costs.length ()
7011 && li_scalar_costs[si].first == sl);
7012 continue;
7015 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
7018 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
7019 si++;
7021 while (si < li_scalar_costs.length ()
7022 && li_scalar_costs[si].first == sl);
7023 unsigned dummy;
7024 finish_cost (scalar_target_cost_data, nullptr,
7025 &dummy, &scalar_cost, &dummy);
7027 /* Complete the target-specific vector cost calculation. */
7028 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
7031 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
7032 vi++;
7034 while (vi < li_vector_costs.length ()
7035 && li_vector_costs[vi].first == vl);
7036 finish_cost (vect_target_cost_data, scalar_target_cost_data,
7037 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7038 delete scalar_target_cost_data;
7039 delete vect_target_cost_data;
7041 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7043 if (dump_enabled_p ())
7045 dump_printf_loc (MSG_NOTE, vect_location,
7046 "Cost model analysis for part in loop %d:\n", sl);
7047 dump_printf (MSG_NOTE, " Vector cost: %d\n",
7048 vec_inside_cost + vec_outside_cost);
7049 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
7052 /* Vectorization is profitable if its cost is not more than the cost of the
7053 scalar version. Note that we err on the vector side for equal cost because
7054 the cost estimate is otherwise quite pessimistic (constant uses are
7055 free on the scalar side but cost a load on the vector side for
7056 example). */
7057 if (vec_outside_cost + vec_inside_cost > scalar_cost)
7059 profitable = false;
7060 break;
7063 if (profitable && vi < li_vector_costs.length ())
7065 if (dump_enabled_p ())
7066 dump_printf_loc (MSG_NOTE, vect_location,
7067 "Excess vector cost for part in loop %d:\n",
7068 li_vector_costs[vi].first);
7069 profitable = false;
7072 /* Unset visited flag. This is delayed when the subgraph is profitable
7073 and we process the loop for remaining unvectorized if-converted code. */
7074 if (!orig_loop || !profitable)
7075 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7076 gimple_set_visited (cost->stmt_info->stmt, false);
7078 scalar_costs.release ();
7079 vector_costs.release ();
7081 return profitable;
7084 /* qsort comparator for lane defs. */
7086 static int
7087 vld_cmp (const void *a_, const void *b_)
7089 auto *a = (const std::pair<unsigned, tree> *)a_;
7090 auto *b = (const std::pair<unsigned, tree> *)b_;
7091 return a->first - b->first;
7094 /* Return true if USE_STMT is a vector lane insert into VEC and set
7095 *THIS_LANE to the lane number that is set. */
7097 static bool
7098 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7100 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
7101 if (!use_ass
7102 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7103 || (vec
7104 ? gimple_assign_rhs1 (use_ass) != vec
7105 : ((vec = gimple_assign_rhs1 (use_ass)), false))
7106 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7107 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7108 || !constant_multiple_p
7109 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7110 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7111 this_lane))
7112 return false;
7113 return true;
7116 /* Find any vectorizable constructors, lane-insert chains and reduction
7117 chains in the region and record them as SLP instance roots in BB_VINFO. */
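/* Editorial example (assumed GIMPLE, not part of the original source):
   a CONSTRUCTOR root matched below looks like
     _5 = {a_1, b_2, c_3, d_4};
   where the LHS type is a four-lane vector and every element is an SSA
   name defined inside the analyzed region.  */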
7119 static void
7120 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7122 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7123 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7124 !gsi_end_p (gsi); gsi_next (&gsi))
7126 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7127 if (!assign)
7128 continue;
7130 tree rhs = gimple_assign_rhs1 (assign);
7131 enum tree_code code = gimple_assign_rhs_code (assign);
7132 use_operand_p use_p;
7133 gimple *use_stmt;
7134 if (code == CONSTRUCTOR)
7136 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7137 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7138 CONSTRUCTOR_NELTS (rhs))
7139 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7140 || uniform_vector_p (rhs))
7141 continue;
7143 unsigned j;
7144 tree val;
7145 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7146 if (TREE_CODE (val) != SSA_NAME
7147 || !bb_vinfo->lookup_def (val))
7148 break;
7149 if (j != CONSTRUCTOR_NELTS (rhs))
7150 continue;
7152 vec<stmt_vec_info> roots = vNULL;
7153 roots.safe_push (bb_vinfo->lookup_stmt (assign));
7154 vec<stmt_vec_info> stmts;
7155 stmts.create (CONSTRUCTOR_NELTS (rhs));
7156 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7157 stmts.quick_push
7158 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
7159 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7160 stmts, roots));
7162 else if (code == BIT_INSERT_EXPR
7163 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7164 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7165 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7166 && integer_zerop (gimple_assign_rhs3 (assign))
7167 && useless_type_conversion_p
7168 (TREE_TYPE (TREE_TYPE (rhs)),
7169 TREE_TYPE (gimple_assign_rhs2 (assign)))
7170 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7172 /* We start matching at an insert to lane zero, but since the
7173 inserts need not be ordered we'd have to search both
7174 the def and the use chains. */
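/* Editorial illustration (assumed GIMPLE for a V4SI vector with 32-bit
   lanes, not part of the original source):
     v_1 = BIT_INSERT_EXPR <v_0, a_2, 96>;    lane 3, found via def chain
     v_3 = BIT_INSERT_EXPR <v_1, b_4, 0>;     lane 0, match starts here
     v_5 = BIT_INSERT_EXPR <v_3, c_6, 32>;    lane 1, found via use chain
     v_7 = BIT_INSERT_EXPR <v_5, d_8, 64>;    lane 2, found via use chain
   After the use- and def-chain walks below all four lanes are found and
   the last use-chain insert (here v_7) is swapped to roots[0] to serve
   as the instance root.  */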
7175 tree vectype = TREE_TYPE (rhs);
7176 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7177 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7178 auto_sbitmap lanes (nlanes);
7179 bitmap_clear (lanes);
7180 bitmap_set_bit (lanes, 0);
7181 tree def = gimple_assign_lhs (assign);
7182 lane_defs.quick_push
7183 (std::make_pair (0, gimple_assign_rhs2 (assign)));
7184 unsigned lanes_found = 1;
7185 /* Start with the use chains; the last stmt will be the root. */
7186 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7187 vec<stmt_vec_info> roots = vNULL;
7188 roots.safe_push (last);
7191 use_operand_p use_p;
7192 gimple *use_stmt;
7193 if (!single_imm_use (def, &use_p, &use_stmt))
7194 break;
7195 unsigned this_lane;
7196 if (!bb_vinfo->lookup_stmt (use_stmt)
7197 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7198 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7199 break;
7200 if (bitmap_bit_p (lanes, this_lane))
7201 break;
7202 lanes_found++;
7203 bitmap_set_bit (lanes, this_lane);
7204 gassign *use_ass = as_a <gassign *> (use_stmt);
7205 lane_defs.quick_push (std::make_pair
7206 (this_lane, gimple_assign_rhs2 (use_ass)));
7207 last = bb_vinfo->lookup_stmt (use_ass);
7208 roots.safe_push (last);
7209 def = gimple_assign_lhs (use_ass);
7211 while (lanes_found < nlanes);
7212 if (roots.length () > 1)
7213 std::swap(roots[0], roots[roots.length () - 1]);
7214 if (lanes_found < nlanes)
7216 /* Now search the def chain. */
7217 def = gimple_assign_rhs1 (assign);
7220 if (TREE_CODE (def) != SSA_NAME
7221 || !has_single_use (def))
7222 break;
7223 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7224 unsigned this_lane;
7225 if (!bb_vinfo->lookup_stmt (def_stmt)
7226 || !vect_slp_is_lane_insert (def_stmt,
7227 NULL_TREE, &this_lane)
7228 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7229 break;
7230 if (bitmap_bit_p (lanes, this_lane))
7231 break;
7232 lanes_found++;
7233 bitmap_set_bit (lanes, this_lane);
7234 lane_defs.quick_push (std::make_pair
7235 (this_lane,
7236 gimple_assign_rhs2 (def_stmt)));
7237 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7238 def = gimple_assign_rhs1 (def_stmt);
7240 while (lanes_found < nlanes);
7242 if (lanes_found == nlanes)
7244 /* Sort lane_defs by the lane index and register the root. */
7245 lane_defs.qsort (vld_cmp);
7246 vec<stmt_vec_info> stmts;
7247 stmts.create (nlanes);
7248 for (unsigned i = 0; i < nlanes; ++i)
7249 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7250 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7251 stmts, roots));
7253 else
7254 roots.release ();
7256 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7257 && (associative_tree_code (code) || code == MINUS_EXPR)
7258 /* ??? This pessimizes a two-element reduction. PR54400.
7259 ??? In-order reduction could be handled if we only
7260 traverse one operand chain in vect_slp_linearize_chain. */
7261 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7262 /* Ops with constants at the tail can be stripped here. */
7263 && TREE_CODE (rhs) == SSA_NAME
7264 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7265 /* Should be the chain end. */
7266 && (!single_imm_use (gimple_assign_lhs (assign),
7267 &use_p, &use_stmt)
7268 || !is_gimple_assign (use_stmt)
7269 || (gimple_assign_rhs_code (use_stmt) != code
7270 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7271 || (gimple_assign_rhs_code (use_stmt)
7272 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7274 /* We start the match at the end of a possible association
7275 chain. */
7276 auto_vec<chain_op_t> chain;
7277 auto_vec<std::pair<tree_code, gimple *> > worklist;
7278 auto_vec<gimple *> chain_stmts;
7279 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7280 if (code == MINUS_EXPR)
7281 code = PLUS_EXPR;
7282 internal_fn reduc_fn;
7283 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7284 || reduc_fn == IFN_LAST)
7285 continue;
7286 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7287 /* ??? */
7288 code_stmt, alt_code_stmt, &chain_stmts);
7289 if (chain.length () > 1)
7291 /* Sort the chain according to def_type and operation. */
7292 chain.sort (dt_sort_cmp, bb_vinfo);
7293 /* ??? Now we'd want to strip externals and constants
7294 but record those to be handled in the epilogue. */
7295 /* ??? For now do not allow mixing ops or externs/constants. */
7296 bool invalid = false;
7297 unsigned remain_cnt = 0;
7298 for (unsigned i = 0; i < chain.length (); ++i)
7300 if (chain[i].code != code)
7302 invalid = true;
7303 break;
7305 if (chain[i].dt != vect_internal_def)
7306 remain_cnt++;
7308 if (!invalid && chain.length () - remain_cnt > 1)
7310 vec<stmt_vec_info> stmts;
7311 vec<tree> remain = vNULL;
7312 stmts.create (chain.length ());
7313 if (remain_cnt > 0)
7314 remain.create (remain_cnt);
7315 for (unsigned i = 0; i < chain.length (); ++i)
7317 if (chain[i].dt == vect_internal_def)
7318 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
7319 else
7320 remain.quick_push (chain[i].op);
7322 vec<stmt_vec_info> roots;
7323 roots.create (chain_stmts.length ());
7324 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7325 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7326 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7327 stmts, roots, remain));
7334 /* Walk the grouped store chains and replace entries with their
7335 pattern variant if any. */
7337 static void
7338 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7340 stmt_vec_info first_element;
7341 unsigned i;
7343 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7345 /* We also have CTORs in this array. */
7346 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7347 continue;
7348 if (STMT_VINFO_IN_PATTERN_P (first_element))
7350 stmt_vec_info orig = first_element;
7351 first_element = STMT_VINFO_RELATED_STMT (first_element);
7352 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7353 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7354 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7355 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7356 vinfo->grouped_stores[i] = first_element;
7358 stmt_vec_info prev = first_element;
7359 while (DR_GROUP_NEXT_ELEMENT (prev))
7361 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7362 if (STMT_VINFO_IN_PATTERN_P (elt))
7364 stmt_vec_info orig = elt;
7365 elt = STMT_VINFO_RELATED_STMT (elt);
7366 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7367 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7368 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7370 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7371 prev = elt;
7376 /* Check if the region described by BB_VINFO can be vectorized, returning
7377 true if so. When returning false, set FATAL to true if the same failure
7378 would prevent vectorization at other vector sizes, false if it is still
7379 worth trying other sizes. N_STMTS is the number of statements in the
7380 region. */
7382 static bool
7383 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7384 vec<int> *dataref_groups)
7386 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7388 slp_instance instance;
7389 int i;
7390 poly_uint64 min_vf = 2;
7392 /* The first group of checks is independent of the vector size. */
7393 fatal = true;
7395 /* Analyze the data references. */
7397 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7399 if (dump_enabled_p ())
7400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7401 "not vectorized: unhandled data-ref in basic "
7402 "block.\n");
7403 return false;
7406 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7408 if (dump_enabled_p ())
7409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7410 "not vectorized: unhandled data access in "
7411 "basic block.\n");
7412 return false;
7415 vect_slp_check_for_roots (bb_vinfo);
7417 /* If there are no grouped stores and no constructors in the region
7418 there is no need to continue with pattern recog as vect_analyze_slp
7419 will fail anyway. */
7420 if (bb_vinfo->grouped_stores.is_empty ()
7421 && bb_vinfo->roots.is_empty ())
7423 if (dump_enabled_p ())
7424 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7425 "not vectorized: no grouped stores in "
7426 "basic block.\n");
7427 return false;
7430 /* The rest of the analysis below depends on the vector size in some way. */
7431 fatal = false;
7433 vect_pattern_recog (bb_vinfo);
7435 /* Update store groups from pattern processing. */
7436 vect_fixup_store_groups_with_patterns (bb_vinfo);
7438 /* Check the SLP opportunities in the basic block, analyze and build SLP
7439 trees. */
7440 if (!vect_analyze_slp (bb_vinfo, n_stmts))
7442 if (dump_enabled_p ())
7444 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7445 "Failed to SLP the basic block.\n");
7446 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7447 "not vectorized: failed to find SLP opportunities "
7448 "in basic block.\n");
7450 return false;
7453 /* Optimize permutations. */
7454 vect_optimize_slp (bb_vinfo);
7456 /* Gather the loads reachable from the SLP graph entries. */
7457 vect_gather_slp_loads (bb_vinfo);
7459 vect_record_base_alignments (bb_vinfo);
7461 /* Analyze and verify the alignment of data references and the
7462 dependence in the SLP instances. */
7463 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7465 vect_location = instance->location ();
7466 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7467 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7469 slp_tree node = SLP_INSTANCE_TREE (instance);
7470 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7471 if (dump_enabled_p ())
7472 dump_printf_loc (MSG_NOTE, vect_location,
7473 "removing SLP instance operations starting from: %G",
7474 stmt_info->stmt);
7475 vect_free_slp_instance (instance);
7476 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7477 continue;
7480 /* Mark all the statements that we want to vectorize as pure SLP and
7481 relevant. */
7482 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7483 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7484 unsigned j;
7485 stmt_vec_info root;
7486 /* Likewise consider instance root stmts as vectorized. */
7487 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7488 STMT_SLP_TYPE (root) = pure_slp;
7490 i++;
7492 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7493 return false;
7495 if (!vect_slp_analyze_operations (bb_vinfo))
7497 if (dump_enabled_p ())
7498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7499 "not vectorized: bad operation in basic block.\n");
7500 return false;
7503 vect_bb_partition_graph (bb_vinfo);
7505 return true;
7508 /* Subroutine of vect_slp_bbs. Try to vectorize the statements for all
7509 basic blocks in BBS, returning true on success.
7510 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
7512 static bool
7513 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7514 vec<int> *dataref_groups, unsigned int n_stmts,
7515 loop_p orig_loop)
7517 bb_vec_info bb_vinfo;
7518 auto_vector_modes vector_modes;
7520 /* Autodetect first vector size we try. */
7521 machine_mode next_vector_mode = VOIDmode;
7522 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7523 unsigned int mode_i = 0;
7525 vec_info_shared shared;
7527 machine_mode autodetected_vector_mode = VOIDmode;
7528 while (1)
7530 bool vectorized = false;
7531 bool fatal = false;
7532 bb_vinfo = new _bb_vec_info (bbs, &shared);
7534 bool first_time_p = shared.datarefs.is_empty ();
7535 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7536 if (first_time_p)
7537 bb_vinfo->shared->save_datarefs ();
7538 else
7539 bb_vinfo->shared->check_datarefs ();
7540 bb_vinfo->vector_mode = next_vector_mode;
7542 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7544 if (dump_enabled_p ())
7546 dump_printf_loc (MSG_NOTE, vect_location,
7547 "***** Analysis succeeded with vector mode"
7548 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7549 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7552 bb_vinfo->shared->check_datarefs ();
7554 auto_vec<slp_instance> profitable_subgraphs;
7555 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7557 if (instance->subgraph_entries.is_empty ())
7558 continue;
7560 dump_user_location_t saved_vect_location = vect_location;
7561 vect_location = instance->location ();
7562 if (!unlimited_cost_model (NULL)
7563 && !vect_bb_vectorization_profitable_p
7564 (bb_vinfo, instance->subgraph_entries, orig_loop))
7566 if (dump_enabled_p ())
7567 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7568 "not vectorized: vectorization is not "
7569 "profitable.\n");
7570 vect_location = saved_vect_location;
7571 continue;
7574 vect_location = saved_vect_location;
7575 if (!dbg_cnt (vect_slp))
7576 continue;
7578 profitable_subgraphs.safe_push (instance);
7581 /* When we're vectorizing an if-converted loop body make sure
7582 we vectorized all if-converted code. */
7583 if (!profitable_subgraphs.is_empty ()
7584 && orig_loop)
7586 gcc_assert (bb_vinfo->bbs.length () == 1);
7587 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7588 !gsi_end_p (gsi); gsi_next (&gsi))
7590 /* The costing above left us with DCEable vectorized scalar
7591 stmts having the visited flag set on profitable
7592 subgraphs. Do the delayed clearing of the flag here. */
7593 if (gimple_visited_p (gsi_stmt (gsi)))
7595 gimple_set_visited (gsi_stmt (gsi), false);
7596 continue;
7598 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7599 continue;
7601 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7602 if (gimple_assign_rhs_code (ass) == COND_EXPR)
7604 if (!profitable_subgraphs.is_empty ()
7605 && dump_enabled_p ())
7606 dump_printf_loc (MSG_NOTE, vect_location,
7607 "not profitable because of "
7608 "unprofitable if-converted scalar "
7609 "code\n");
7610 profitable_subgraphs.truncate (0);
7615 /* Finally schedule the profitable subgraphs. */
7616 for (slp_instance instance : profitable_subgraphs)
7618 if (!vectorized && dump_enabled_p ())
7619 dump_printf_loc (MSG_NOTE, vect_location,
7620 "Basic block will be vectorized "
7621 "using SLP\n");
7622 vectorized = true;
7624 /* Dump before scheduling as store vectorization will remove
7625 the original stores and mess with the instance tree
7626 so querying its location will eventually ICE. */
7627 if (flag_checking)
7628 for (slp_instance sub : instance->subgraph_entries)
7629 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7630 unsigned HOST_WIDE_INT bytes;
7631 if (dump_enabled_p ())
7632 for (slp_instance sub : instance->subgraph_entries)
7634 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7635 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
7636 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7637 sub->location (),
7638 "basic block part vectorized using %wu "
7639 "byte vectors\n", bytes);
7640 else
7641 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7642 sub->location (),
7643 "basic block part vectorized using "
7644 "variable length vectors\n");
7647 dump_user_location_t saved_vect_location = vect_location;
7648 vect_location = instance->location ();
7650 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7652 vect_location = saved_vect_location;
7655 else
7657 if (dump_enabled_p ())
7658 dump_printf_loc (MSG_NOTE, vect_location,
7659 "***** Analysis failed with vector mode %s\n",
7660 GET_MODE_NAME (bb_vinfo->vector_mode));
7663 if (mode_i == 0)
7664 autodetected_vector_mode = bb_vinfo->vector_mode;
7666 if (!fatal)
7667 while (mode_i < vector_modes.length ()
7668 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7670 if (dump_enabled_p ())
7671 dump_printf_loc (MSG_NOTE, vect_location,
7672 "***** The result for vector mode %s would"
7673 " be the same\n",
7674 GET_MODE_NAME (vector_modes[mode_i]));
7675 mode_i += 1;
7678 delete bb_vinfo;
7680 if (mode_i < vector_modes.length ()
7681 && VECTOR_MODE_P (autodetected_vector_mode)
7682 && (related_vector_mode (vector_modes[mode_i],
7683 GET_MODE_INNER (autodetected_vector_mode))
7684 == autodetected_vector_mode)
7685 && (related_vector_mode (autodetected_vector_mode,
7686 GET_MODE_INNER (vector_modes[mode_i]))
7687 == vector_modes[mode_i]))
7689 if (dump_enabled_p ())
7690 dump_printf_loc (MSG_NOTE, vect_location,
7691 "***** Skipping vector mode %s, which would"
7692 " repeat the analysis for %s\n",
7693 GET_MODE_NAME (vector_modes[mode_i]),
7694 GET_MODE_NAME (autodetected_vector_mode));
7695 mode_i += 1;
7698 if (vectorized
7699 || mode_i == vector_modes.length ()
7700 || autodetected_vector_mode == VOIDmode
7701 /* If vect_slp_analyze_bb_1 signaled that analysis for all
7702 vector sizes will fail do not bother iterating. */
7703 || fatal)
7704 return vectorized;
7706 /* Try the next biggest vector size. */
7707 next_vector_mode = vector_modes[mode_i++];
7708 if (dump_enabled_p ())
7709 dump_printf_loc (MSG_NOTE, vect_location,
7710 "***** Re-trying analysis with vector mode %s\n",
7711 GET_MODE_NAME (next_vector_mode));
7716 /* Main entry for the BB vectorizer. Analyze and transform the basic blocks
7717 in BBS, returning true if anything in them was vectorized. */
7719 static bool
7720 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7722 vec<data_reference_p> datarefs = vNULL;
7723 auto_vec<int> dataref_groups;
7724 int insns = 0;
7725 int current_group = 0;
7727 for (unsigned i = 0; i < bbs.length (); i++)
7729 basic_block bb = bbs[i];
7730 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
7731 gsi_next (&gsi))
7733 gimple *stmt = gsi_stmt (gsi);
7734 if (is_gimple_debug (stmt))
7735 continue;
7737 insns++;
7739 if (gimple_location (stmt) != UNKNOWN_LOCATION)
7740 vect_location = stmt;
7742 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
7743 &dataref_groups, current_group))
7744 ++current_group;
7746 /* New BBs always start a new DR group. */
7747 ++current_group;
7750 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
7753 /* Special entry for the BB vectorizer. Analyze and transform a single
7754 if-converted BB with ORIG_LOOPs body being the not if-converted
7755 representation. Returns true if anything in the basic-block was
7756 vectorized. */
7758 bool
7759 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
7761 auto_vec<basic_block> bbs;
7762 bbs.safe_push (bb);
7763 return vect_slp_bbs (bbs, orig_loop);
7766 /* Main entry for the BB vectorizer when run over a whole function. Analyze
7767 and transform the basic blocks of FUN, returning true if anything was vectorized. */
7769 bool
7770 vect_slp_function (function *fun)
7772 bool r = false;
7773 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
7774 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
7776 /* For the moment split the function into pieces to avoid making
7777 the iteration on the vector mode moot. Split at points we know
7778 not to handle well, which are CFG merges (SLP discovery doesn't
7779 handle non-loop-header PHIs) and loop exits. Since pattern
7780 recog requires reverse iteration to visit uses before defs
7781 simply chop RPO into pieces. */
7782 auto_vec<basic_block> bbs;
7783 for (unsigned i = 0; i < n; i++)
7785 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
7786 bool split = false;
7788 /* Split when a BB is not dominated by the first block. */
7789 if (!bbs.is_empty ()
7790 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
7792 if (dump_enabled_p ())
7793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7794 "splitting region at dominance boundary bb%d\n",
7795 bb->index);
7796 split = true;
7798 /* Split when the loop determined by the first block
7799 is exited. This is because we eventually insert
7800 invariants at region begin. */
7801 else if (!bbs.is_empty ()
7802 && bbs[0]->loop_father != bb->loop_father
7803 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
7805 if (dump_enabled_p ())
7806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7807 "splitting region at loop %d exit at bb%d\n",
7808 bbs[0]->loop_father->num, bb->index);
7809 split = true;
7811 else if (!bbs.is_empty ()
7812 && bb->loop_father->header == bb
7813 && bb->loop_father->dont_vectorize)
7815 if (dump_enabled_p ())
7816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7817 "splitting region at dont-vectorize loop %d "
7818 "entry at bb%d\n",
7819 bb->loop_father->num, bb->index);
7820 split = true;
7823 if (split && !bbs.is_empty ())
7825 r |= vect_slp_bbs (bbs, NULL);
7826 bbs.truncate (0);
7829 if (bbs.is_empty ())
7831 /* We need to be able to insert at the head of the region, which
7832 we cannot do for a region starting with a returns-twice call. */
7833 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
7834 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
7836 if (dump_enabled_p ())
7837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7838 "skipping bb%d as start of region as it "
7839 "starts with returns-twice call\n",
7840 bb->index);
7841 continue;
7843 /* If the loop this BB belongs to is marked as not to be vectorized
7844 honor that also for BB vectorization. */
7845 if (bb->loop_father->dont_vectorize)
7846 continue;
7849 bbs.safe_push (bb);
7851 /* When a stmt ends this block and defines a value, inserting a
7852 vector containing its definition after it would require inserting
7853 on edges. Avoid this for now. */
7854 if (gimple *last = *gsi_last_bb (bb))
7855 if (gimple_get_lhs (last)
7856 && is_ctrl_altering_stmt (last))
7858 if (dump_enabled_p ())
7859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7860 "splitting region at control altering "
7861 "definition %G", last);
7862 r |= vect_slp_bbs (bbs, NULL);
7863 bbs.truncate (0);
7867 if (!bbs.is_empty ())
7868 r |= vect_slp_bbs (bbs, NULL);
7870 free (rpo);
7872 return r;
7875 /* Build a variable-length vector in which the elements in ELTS are repeated
7876 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
7877 RESULTS and add any new instructions to SEQ.
7879 The approach we use is:
7881 (1) Find a vector mode VM with integer elements of mode IM.
7883 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7884 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
7885 from small vectors to IM.
7887 (3) Duplicate each ELTS'[I] into a vector of mode VM.
7889 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
7890 correct byte contents.
7892 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
7894 We try to find the largest IM for which this sequence works, in order
7895 to cut down on the number of interleaves. */
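/* Editorial sketch with assumed modes (not part of the original source):
   to repeat eight HImode elements {a, ..., h} into variable-length HI
   vectors, one valid outcome of step (1) is IM = DImode with NVECTORS = 2.
   Step (2) then packs {a,b,c,d} and {e,f,g,h} into two DI scalars, step
   (3) splats each into a variable-length DI vector, a single interleaving
   VEC_PERM_EXPR in step (4) produces the repeating a..h element pattern,
   and step (5) view-converts the result to the requested HI vector type.  */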
7897 void
7898 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
7899 const vec<tree> &elts, unsigned int nresults,
7900 vec<tree> &results)
7902 unsigned int nelts = elts.length ();
7903 tree element_type = TREE_TYPE (vector_type);
7905 /* (1) Find a vector mode VM with integer elements of mode IM. */
7906 unsigned int nvectors = 1;
7907 tree new_vector_type;
7908 tree permutes[2];
7909 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
7910 &nvectors, &new_vector_type,
7911 permutes))
7912 gcc_unreachable ();
7914 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
7915 unsigned int partial_nelts = nelts / nvectors;
7916 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
7918 tree_vector_builder partial_elts;
7919 auto_vec<tree, 32> pieces (nvectors * 2);
7920 pieces.quick_grow_cleared (nvectors * 2);
7921 for (unsigned int i = 0; i < nvectors; ++i)
7923 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7924 ELTS' has mode IM. */
7925 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
7926 for (unsigned int j = 0; j < partial_nelts; ++j)
7927 partial_elts.quick_push (elts[i * partial_nelts + j]);
7928 tree t = gimple_build_vector (seq, &partial_elts);
7929 t = gimple_build (seq, VIEW_CONVERT_EXPR,
7930 TREE_TYPE (new_vector_type), t);
7932 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
7933 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
7936 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
7937 correct byte contents.
7939 Conceptually, we need to repeat the following operation log2(nvectors)
7940 times, where hi_start = nvectors / 2:
7942 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
7943 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
7945 However, if each input repeats every N elements and the VF is
7946 a multiple of N * 2, the HI result is the same as the LO result.
7947 This will be true for the first N1 iterations of the outer loop,
7948 followed by N2 iterations for which both the LO and HI results
7949 are needed. I.e.:
7951 N1 + N2 = log2(nvectors)
7953 Each "N1 iteration" doubles the number of redundant vectors and the
7954 effect of the process as a whole is to have a sequence of nvectors/2**N1
7955 vectors that repeats 2**N1 times. Rather than generate these redundant
7956 vectors, we halve the number of vectors for each N1 iteration. */
7957 unsigned int in_start = 0;
7958 unsigned int out_start = nvectors;
7959 unsigned int new_nvectors = nvectors;
7960 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
7962 unsigned int hi_start = new_nvectors / 2;
7963 unsigned int out_i = 0;
7964 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
7966 if ((in_i & 1) != 0
7967 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
7968 2 * in_repeat))
7969 continue;
7971 tree output = make_ssa_name (new_vector_type);
7972 tree input1 = pieces[in_start + (in_i / 2)];
7973 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
7974 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
7975 input1, input2,
7976 permutes[in_i & 1]);
7977 gimple_seq_add_stmt (seq, stmt);
7978 pieces[out_start + out_i] = output;
7979 out_i += 1;
7981 std::swap (in_start, out_start);
7982 new_nvectors = out_i;
7985 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
7986 results.reserve (nresults);
7987 for (unsigned int i = 0; i < nresults; ++i)
7988 if (i < new_nvectors)
7989 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
7990 pieces[in_start + i]));
7991 else
7992 results.quick_push (results[i - new_nvectors]);
7996 /* For constant and loop invariant defs in OP_NODE this function creates
7997 vector defs that will be used in the vectorized stmts and stores them
7998 to SLP_TREE_VEC_DEFS of OP_NODE. */
8000 static void
8001 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
8003 unsigned HOST_WIDE_INT nunits;
8004 tree vec_cst;
8005 unsigned j, number_of_places_left_in_vector;
8006 tree vector_type;
8007 tree vop;
8008 int group_size = op_node->ops.length ();
8009 unsigned int vec_num, i;
8010 unsigned number_of_copies = 1;
8011 bool constant_p;
8012 gimple_seq ctor_seq = NULL;
8013 auto_vec<tree, 16> permute_results;
8015 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
8016 vector_type = SLP_TREE_VECTYPE (op_node);
8018 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
8019 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
8020 auto_vec<tree> voprnds (number_of_vectors);
8022 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
8023 created vectors. It is greater than 1 if unrolling is performed.
8025 For example, we have two scalar operands, s1 and s2 (e.g., group of
8026 strided accesses of size two), while NUNITS is four (i.e., four scalars
8027 of this type can be packed in a vector). The output vector will contain
8028 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
8029 will be 2).
8031 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
8032 containing the operands.
8034 For example, NUNITS is four as before, and the group size is 8
8035 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
8036 {s5, s6, s7, s8}. */
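/* Editorial example (assumed numbers, not part of the original source):
   with GROUP_SIZE = 2, NUNITS = 4 and NUMBER_OF_VECTORS = 1 the
   computation below gives NUMBER_OF_COPIES = 4 * 1 / 2 = 2, matching the
   {s1, s2, s1, s2} case above.  */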
8038 /* When using duplicate_and_interleave, we just need one element for
8039 each scalar statement. */
8040 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
8041 nunits = group_size;
8043 number_of_copies = nunits * number_of_vectors / group_size;
8045 number_of_places_left_in_vector = nunits;
8046 constant_p = true;
8047 tree_vector_builder elts (vector_type, nunits, 1);
8048 elts.quick_grow (nunits);
8049 stmt_vec_info insert_after = NULL;
8050 for (j = 0; j < number_of_copies; j++)
8052 tree op;
8053 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
8055 /* Create 'vect_ = {op0,op1,...,opn}'. */
8056 number_of_places_left_in_vector--;
8057 tree orig_op = op;
8058 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8060 if (CONSTANT_CLASS_P (op))
8062 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8064 /* Can't use VIEW_CONVERT_EXPR for booleans because
8065 of possibly different sizes of scalar value and
8066 vector element. */
8067 if (integer_zerop (op))
8068 op = build_int_cst (TREE_TYPE (vector_type), 0);
8069 else if (integer_onep (op))
8070 op = build_all_ones_cst (TREE_TYPE (vector_type));
8071 else
8072 gcc_unreachable ();
8074 else
8075 op = fold_unary (VIEW_CONVERT_EXPR,
8076 TREE_TYPE (vector_type), op);
8077 gcc_assert (op && CONSTANT_CLASS_P (op));
8079 else
8081 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8082 gimple *init_stmt;
8083 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8085 tree true_val
8086 = build_all_ones_cst (TREE_TYPE (vector_type));
8087 tree false_val
8088 = build_zero_cst (TREE_TYPE (vector_type));
8089 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8090 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8091 op, true_val,
8092 false_val);
8094 else
8096 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8097 op);
8098 init_stmt
8099 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8100 op);
8102 gimple_seq_add_stmt (&ctor_seq, init_stmt);
8103 op = new_temp;
8106 elts[number_of_places_left_in_vector] = op;
8107 if (!CONSTANT_CLASS_P (op))
8108 constant_p = false;
8109 /* For BB vectorization we have to compute an insert location
8110 when a def is inside the analyzed region since we cannot
8111 simply insert at the BB start in this case. */
8112 stmt_vec_info opdef;
8113 if (TREE_CODE (orig_op) == SSA_NAME
8114 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8115 && is_a <bb_vec_info> (vinfo)
8116 && (opdef = vinfo->lookup_def (orig_op)))
8118 if (!insert_after)
8119 insert_after = opdef;
8120 else
8121 insert_after = get_later_stmt (insert_after, opdef);
8124 if (number_of_places_left_in_vector == 0)
8126 if (constant_p
8127 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
8128 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
8129 vec_cst = gimple_build_vector (&ctor_seq, &elts);
8130 else
8132 if (permute_results.is_empty ())
8133 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
8134 elts, number_of_vectors,
8135 permute_results);
8136 vec_cst = permute_results[number_of_vectors - j - 1];
8138 if (!gimple_seq_empty_p (ctor_seq))
8140 if (insert_after)
8142 gimple_stmt_iterator gsi;
8143 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
8145 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
8146 gsi_insert_seq_before (&gsi, ctor_seq,
8147 GSI_CONTINUE_LINKING);
8149 else if (!stmt_ends_bb_p (insert_after->stmt))
8151 gsi = gsi_for_stmt (insert_after->stmt);
8152 gsi_insert_seq_after (&gsi, ctor_seq,
8153 GSI_CONTINUE_LINKING);
8155 else
8157 /* When we want to insert after a def whose
8158 defining stmt throws, insert on the fallthru
8159 edge. */
8160 edge e = find_fallthru_edge
8161 (gimple_bb (insert_after->stmt)->succs);
8162 basic_block new_bb
8163 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8164 gcc_assert (!new_bb);
8167 else
8168 vinfo->insert_seq_on_entry (NULL, ctor_seq);
8169 ctor_seq = NULL;
8171 voprnds.quick_push (vec_cst);
8172 insert_after = NULL;
8173 number_of_places_left_in_vector = nunits;
8174 constant_p = true;
8175 elts.new_vector (vector_type, nunits, 1);
8176 elts.quick_grow (nunits);
8181 /* Since the vectors were created in reverse order, reverse them
8182 here. */
8183 vec_num = voprnds.length ();
8184 for (j = vec_num; j != 0; j--)
8186 vop = voprnds[j - 1];
8187 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8190 /* If the VF is greater than the unrolling factor needed for the SLP
8191 group of stmts, NUMBER_OF_VECTORS to be created is greater than
8192 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8193 to replicate the vectors. */
8194 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8195 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8196 i++)
8197 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8200 /* Get the Ith vectorized definition from SLP_NODE. */
8202 tree
8203 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8205 return SLP_TREE_VEC_DEFS (slp_node)[i];
8208 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8210 void
8211 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8213 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8214 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8217 /* Get N vectorized definitions for SLP_NODE. */
8219 void
8220 vect_get_slp_defs (vec_info *,
8221 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8223 if (n == -1U)
8224 n = SLP_TREE_CHILDREN (slp_node).length ();
8226 for (unsigned i = 0; i < n; ++i)
8228 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8229 vec<tree> vec_defs = vNULL;
8230 vect_get_slp_defs (child, &vec_defs);
8231 vec_oprnds->quick_push (vec_defs);
8235 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8236 - PERM gives the permutation that the caller wants to use for NODE,
8237 which might be different from SLP_LOAD_PERMUTATION.
8238 - DUMP_P controls whether the function dumps information. */
8240 static bool
8241 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8242 load_permutation_t &perm,
8243 const vec<tree> &dr_chain,
8244 gimple_stmt_iterator *gsi, poly_uint64 vf,
8245 bool analyze_only, bool dump_p,
8246 unsigned *n_perms, unsigned int *n_loads,
8247 bool dce_chain)
8249 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8250 int vec_index = 0;
8251 tree vectype = SLP_TREE_VECTYPE (node);
8252 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8253 unsigned int mask_element;
8254 unsigned dr_group_size;
8255 machine_mode mode;
8257 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8258 dr_group_size = 1;
8259 else
8261 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8262 dr_group_size = DR_GROUP_SIZE (stmt_info);
8265 mode = TYPE_MODE (vectype);
8266 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8267 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8269 /* Initialize the vect stmts of NODE to properly insert the generated
8270 stmts later. */
8271 if (! analyze_only)
8272 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8273 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8275 /* Generate permutation masks for every NODE. The number of masks for each
8276 NODE is equal to GROUP_SIZE.
8277 E.g., we have a group of three nodes with three loads from the same
8278 location in each node, and the vector size is 4. I.e., we have an
8279 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8280 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8281 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8284 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8285 The last mask is illegal since we assume two operands for permute
8286 operation, and the mask element values can't be outside that range.
8287 Hence, the last mask must be converted into {2,5,5,5}.
8288 For the first two permutations we need the first and the second input
8289 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8290 we need the second and the third vectors: {b1,c1,a2,b2} and
8291 {c2,a3,b3,c3}. */
8293 int vect_stmts_counter = 0;
8294 unsigned int index = 0;
8295 int first_vec_index = -1;
8296 int second_vec_index = -1;
8297 bool noop_p = true;
8298 *n_perms = 0;
8300 vec_perm_builder mask;
8301 unsigned int nelts_to_build;
8302 unsigned int nvectors_per_build;
8303 unsigned int in_nlanes;
8304 bool repeating_p = (group_size == dr_group_size
8305 && multiple_p (nunits, group_size));
8306 if (repeating_p)
8308 /* A single vector contains a whole number of copies of the node, so:
8309 (a) all permutes can use the same mask; and
8310 (b) the permutes only need a single vector input. */
8311 mask.new_vector (nunits, group_size, 3);
8312 nelts_to_build = mask.encoded_nelts ();
8313 /* It's possible to obtain zero nstmts during analyze_only, so make
8314 it at least one to ensure the later computation for n_perms
8315 proceeds. */
8316 nvectors_per_build = nstmts > 0 ? nstmts : 1;
8317 in_nlanes = dr_group_size * 3;
8319 else
8321 /* We need to construct a separate mask for each vector statement. */
8322 unsigned HOST_WIDE_INT const_nunits, const_vf;
8323 if (!nunits.is_constant (&const_nunits)
8324 || !vf.is_constant (&const_vf))
8325 return false;
8326 mask.new_vector (const_nunits, const_nunits, 1);
8327 nelts_to_build = const_vf * group_size;
8328 nvectors_per_build = 1;
8329 in_nlanes = const_vf * dr_group_size;
8331 auto_sbitmap used_in_lanes (in_nlanes);
8332 bitmap_clear (used_in_lanes);
8333 auto_bitmap used_defs;
8335 unsigned int count = mask.encoded_nelts ();
8336 mask.quick_grow (count);
8337 vec_perm_indices indices;
8339 for (unsigned int j = 0; j < nelts_to_build; j++)
8341 unsigned int iter_num = j / group_size;
8342 unsigned int stmt_num = j % group_size;
8343 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8344 bitmap_set_bit (used_in_lanes, i);
8345 if (repeating_p)
8347 first_vec_index = 0;
8348 mask_element = i;
8350 else
8352 /* Enforced before the loop when !repeating_p. */
8353 unsigned int const_nunits = nunits.to_constant ();
8354 vec_index = i / const_nunits;
8355 mask_element = i % const_nunits;
8356 if (vec_index == first_vec_index
8357 || first_vec_index == -1)
8359 first_vec_index = vec_index;
8361 else if (vec_index == second_vec_index
8362 || second_vec_index == -1)
8364 second_vec_index = vec_index;
8365 mask_element += const_nunits;
8367 else
8369 if (dump_p)
8370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8371 "permutation requires at "
8372 "least three vectors %G",
8373 stmt_info->stmt);
8374 gcc_assert (analyze_only);
8375 return false;
8378 gcc_assert (mask_element < 2 * const_nunits);
8381 if (mask_element != index)
8382 noop_p = false;
8383 mask[index++] = mask_element;
8385 if (index == count)
8387 if (!noop_p)
8389 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8390 if (!can_vec_perm_const_p (mode, mode, indices))
8392 if (dump_p)
8394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8395 "unsupported vect permute { ");
8396 for (i = 0; i < count; ++i)
8398 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8399 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8401 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8403 gcc_assert (analyze_only);
8404 return false;
8407 tree mask_vec = NULL_TREE;
8408 if (!analyze_only)
8409 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8411 if (second_vec_index == -1)
8412 second_vec_index = first_vec_index;
8414 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8416 ++*n_perms;
8417 if (analyze_only)
8418 continue;
8419 /* Generate the permute statement if necessary. */
8420 tree first_vec = dr_chain[first_vec_index + ri];
8421 tree second_vec = dr_chain[second_vec_index + ri];
8422 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8423 tree perm_dest
8424 = vect_create_destination_var (gimple_assign_lhs (stmt),
8425 vectype);
8426 perm_dest = make_ssa_name (perm_dest);
8427 gimple *perm_stmt
8428 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8429 second_vec, mask_vec);
8430 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8431 gsi);
8432 if (dce_chain)
8434 bitmap_set_bit (used_defs, first_vec_index + ri);
8435 bitmap_set_bit (used_defs, second_vec_index + ri);
8438 /* Store the vector statement in NODE. */
8439 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8442 else if (!analyze_only)
8444 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8446 tree first_vec = dr_chain[first_vec_index + ri];
8447 /* If mask was NULL_TREE generate the requested
8448 identity transform. */
8449 if (dce_chain)
8450 bitmap_set_bit (used_defs, first_vec_index + ri);
8452 /* Store the vector statement in NODE. */
8453 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8457 index = 0;
8458 first_vec_index = -1;
8459 second_vec_index = -1;
8460 noop_p = true;
8464 if (n_loads)
8466 if (repeating_p)
8467 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8468 else
8470 /* Enforced above when !repeating_p. */
8471 unsigned int const_nunits = nunits.to_constant ();
8472 *n_loads = 0;
8473 bool load_seen = false;
8474 for (unsigned i = 0; i < in_nlanes; ++i)
8476 if (i % const_nunits == 0)
8478 if (load_seen)
8479 *n_loads += 1;
8480 load_seen = false;
8482 if (bitmap_bit_p (used_in_lanes, i))
8483 load_seen = true;
8485 if (load_seen)
8486 *n_loads += 1;
8490 if (dce_chain)
8491 for (unsigned i = 0; i < dr_chain.length (); ++i)
8492 if (!bitmap_bit_p (used_defs, i))
8494 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
8495 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8496 gsi_remove (&rgsi, true);
8497 release_defs (stmt);
8500 return true;
8503 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8504 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8505 permute statements for the SLP node NODE. Store the number of vector
8506 permute instructions in *N_PERMS and the number of vector load
8507 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8508 that were not needed. */
8510 bool
8511 vect_transform_slp_perm_load (vec_info *vinfo,
8512 slp_tree node, const vec<tree> &dr_chain,
8513 gimple_stmt_iterator *gsi, poly_uint64 vf,
8514 bool analyze_only, unsigned *n_perms,
8515 unsigned int *n_loads, bool dce_chain)
8517 return vect_transform_slp_perm_load_1 (vinfo, node,
8518 SLP_TREE_LOAD_PERMUTATION (node),
8519 dr_chain, gsi, vf, analyze_only,
8520 dump_enabled_p (), n_perms, n_loads,
8521 dce_chain);
8524 /* Produce the next vector result for SLP permutation NODE by adding a vector
8525 statement at GSI. If MASK_VEC is nonnull, add:
8527 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8529 otherwise add:
8531 <new SSA name> = FIRST_DEF. */
8533 static void
8534 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8535 slp_tree node, tree first_def, tree second_def,
8536 tree mask_vec, poly_uint64 identity_offset)
8538 tree vectype = SLP_TREE_VECTYPE (node);
8540 /* ??? We SLP match existing vector element extracts but
8541 allow punning which we need to re-instantiate at uses
8542 but have no good way of explicitly representing. */
8543 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8544 && !types_compatible_p (TREE_TYPE (first_def), vectype))
8546 gassign *conv_stmt
8547 = gimple_build_assign (make_ssa_name (vectype),
8548 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8549 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8550 first_def = gimple_assign_lhs (conv_stmt);
8552 gassign *perm_stmt;
8553 tree perm_dest = make_ssa_name (vectype);
8554 if (mask_vec)
8556 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8557 TYPE_SIZE (vectype))
8558 && !types_compatible_p (TREE_TYPE (second_def), vectype))
8560 gassign *conv_stmt
8561 = gimple_build_assign (make_ssa_name (vectype),
8562 build1 (VIEW_CONVERT_EXPR,
8563 vectype, second_def));
8564 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8565 second_def = gimple_assign_lhs (conv_stmt);
8567 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8568 first_def, second_def,
8569 mask_vec);
8571 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8573 /* For identity permutes we still need to handle the case
8574 of offsetted extracts or concats. */
8575 unsigned HOST_WIDE_INT c;
8576 auto first_def_nunits
8577 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8578 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8580 unsigned HOST_WIDE_INT elsz
8581 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8582 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8583 TYPE_SIZE (vectype),
8584 bitsize_int (identity_offset * elsz));
8585 perm_stmt = gimple_build_assign (perm_dest, lowpart);
8587 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8588 first_def_nunits, &c) && c == 2)
8590 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8591 NULL_TREE, second_def);
8592 perm_stmt = gimple_build_assign (perm_dest, ctor);
8594 else
8595 gcc_unreachable ();
8597 else
8599 /* We need a copy here in case the def was external. */
8600 perm_stmt = gimple_build_assign (perm_dest, first_def);
8602 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8603 /* Store the vector statement in NODE. */
8604 node->push_vec_def (perm_stmt);
8607 /* Subroutine of vectorizable_slp_permutation. Check whether the target
8608 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8609 If GSI is nonnull, emit the permutation there.
8611 When GSI is null, the only purpose of NODE is to give properties
8612 of the result, such as the vector type and number of SLP lanes.
8613 The node does not need to be a VEC_PERM_EXPR.
8615 If the target supports the operation, return the number of individual
8616 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8617 dump file if DUMP_P is true. */
8619 static int
8620 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8621 slp_tree node, lane_permutation_t &perm,
8622 vec<slp_tree> &children, bool dump_p)
8624 tree vectype = SLP_TREE_VECTYPE (node);
8626 /* ??? We currently only support all same vector input types
8627 while the SLP IL should really do a concat + select and thus accept
8628 arbitrary mismatches. */
8629 slp_tree child;
8630 unsigned i;
8631 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8632 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8633 tree op_vectype = NULL_TREE;
8634 FOR_EACH_VEC_ELT (children, i, child)
8635 if (SLP_TREE_VECTYPE (child))
8637 op_vectype = SLP_TREE_VECTYPE (child);
8638 break;
8640 if (!op_vectype)
8641 op_vectype = vectype;
8642 FOR_EACH_VEC_ELT (children, i, child)
8644 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8645 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8646 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8647 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8649 if (dump_p)
8650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8651 "Unsupported vector types in lane permutation\n");
8652 return -1;
8654 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8655 repeating_p = false;
8658 gcc_assert (perm.length () == SLP_TREE_LANES (node));
8659 if (dump_p)
8661 dump_printf_loc (MSG_NOTE, vect_location,
8662 "vectorizing permutation");
8663 for (unsigned i = 0; i < perm.length (); ++i)
8664 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8665 if (repeating_p)
8666 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8667 dump_printf (MSG_NOTE, "\n");
8670 /* REPEATING_P is true if every output vector is guaranteed to use the
8671 same permute vector. We can handle that case for both variable-length
8672 and constant-length vectors, but we only handle other cases for
8673 constant-length vectors.
8675 Set:
8677 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8678 mask vector that we want to build.
8680 - NCOPIES to the number of copies of PERM that we need in order
8681 to build the necessary permute mask vectors.
8683 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8684 for each permute mask vector. This is only relevant when GSI is
8685 nonnull. */
8686 uint64_t npatterns;
8687 unsigned nelts_per_pattern;
8688 uint64_t ncopies;
8689 unsigned noutputs_per_mask;
8690 if (repeating_p)
8692 /* We need a single permute mask vector that has the form:
8694 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8696 In other words, the original n-element permute in PERM is
8697 "unrolled" to fill a full vector. The stepped vector encoding
8698 that we use for permutes requires 3n elements. */
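/* Editorial example (assumed two-lane node, not part of the original
   source): if PERM swaps the two lanes of a single operand, the
   unrolled mask is { 1, 0, 3, 2, 5, 4, ... } and the stepped encoding
   set up below represents it with NPATTERNS = 2 and
   NELTS_PER_PATTERN = 3, i.e. the six encoded elements 1,0,3,2,5,4.  */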
8699 npatterns = SLP_TREE_LANES (node);
8700 nelts_per_pattern = ncopies = 3;
8701 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8703 else
8705 /* Calculate every element of every permute mask vector explicitly,
8706 instead of relying on the pattern described above. */
8707 if (!nunits.is_constant (&npatterns))
8708 return -1;
8709 nelts_per_pattern = ncopies = 1;
8710 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
8711 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
8712 return -1;
8713 noutputs_per_mask = 1;
8715 unsigned olanes = ncopies * SLP_TREE_LANES (node);
8716 gcc_assert (repeating_p || multiple_p (olanes, nunits));
8718 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
8719 from the { SLP operand, scalar lane } permutation as recorded in the
8720 SLP node as an intermediate step. This part should already work
8721 with SLP children with arbitrary number of lanes. */
8722 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
8723 auto_vec<unsigned> active_lane;
8724 vperm.create (olanes);
8725 active_lane.safe_grow_cleared (children.length (), true);
8726 for (unsigned i = 0; i < ncopies; ++i)
8728 for (unsigned pi = 0; pi < perm.length (); ++pi)
8730 std::pair<unsigned, unsigned> p = perm[pi];
8731 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
8732 if (repeating_p)
8733 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
8734 else
8736 /* We checked above that the vectors are constant-length. */
8737 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
8738 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
8739 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
8740 vperm.quick_push ({{p.first, vi}, vl});
8743 /* Advance to the next group. */
8744 for (unsigned j = 0; j < children.length (); ++j)
8745 active_lane[j] += SLP_TREE_LANES (children[j]);
8748 if (dump_p)
8750 dump_printf_loc (MSG_NOTE, vect_location,
8751 "vectorizing permutation");
8752 for (unsigned i = 0; i < perm.length (); ++i)
8753 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8754 if (repeating_p)
8755 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8756 dump_printf (MSG_NOTE, "\n");
8757 dump_printf_loc (MSG_NOTE, vect_location, "as");
8758 for (unsigned i = 0; i < vperm.length (); ++i)
8760 if (i != 0
8761 && (repeating_p
8762 ? multiple_p (i, npatterns)
8763 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
8764 dump_printf (MSG_NOTE, ",");
8765 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
8766 vperm[i].first.first, vperm[i].first.second,
8767 vperm[i].second);
8769 dump_printf (MSG_NOTE, "\n");
8772 /* We can only handle two-vector permutes; everything else should
8773 be lowered on the SLP level. The following is closely inspired
8774 by vect_transform_slp_perm_load and is supposed to eventually
8775 replace it.
8776 ??? As an intermediate step, do code-gen in the SLP tree
8777 representation somehow? */
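/* Note a VEC_PERM_EXPR has exactly two vector inputs, so each mask
built below may reference at most two distinct { operand, vector }
pairs; wider selections are rejected (see the "requires at least
three vectors" diagnostic below) and have to be lowered earlier. */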
8778 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
8779 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
8780 unsigned int index = 0;
8781 poly_uint64 mask_element;
8782 vec_perm_builder mask;
8783 mask.new_vector (nunits, npatterns, nelts_per_pattern);
8784 unsigned int count = mask.encoded_nelts ();
8785 mask.quick_grow (count);
8786 vec_perm_indices indices;
8787 unsigned nperms = 0;
8788 for (unsigned i = 0; i < vperm.length (); ++i)
8790 mask_element = vperm[i].second;
8791 if (first_vec.first == -1U
8792 || first_vec == vperm[i].first)
8793 first_vec = vperm[i].first;
8794 else if (second_vec.first == -1U
8795 || second_vec == vperm[i].first)
8797 second_vec = vperm[i].first;
8798 mask_element += nunits;
8800 else
8802 if (dump_p)
8803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8804 "permutation requires at "
8805 "least three vectors\n");
8806 gcc_assert (!gsi);
8807 return -1;
8810 mask[index++] = mask_element;
8812 if (index == count)
8814 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
8815 TYPE_VECTOR_SUBPARTS (op_vectype));
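/* The mask is an identity when it selects nunits consecutive elements
starting at a multiple of nunits, i.e. it forwards whole input
vector(s) unchanged and no permute instruction is needed. */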
8816 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
8817 && constant_multiple_p (mask[0], nunits));
8818 machine_mode vmode = TYPE_MODE (vectype);
8819 machine_mode op_vmode = TYPE_MODE (op_vectype);
8820 unsigned HOST_WIDE_INT c;
8821 if ((!identity_p
8822 && !can_vec_perm_const_p (vmode, op_vmode, indices))
8823 || (identity_p
8824 && !known_le (nunits,
8825 TYPE_VECTOR_SUBPARTS (op_vectype))
8826 && (!constant_multiple_p (nunits,
8827 TYPE_VECTOR_SUBPARTS (op_vectype),
8828 &c) || c != 2)))
8830 if (dump_p)
8832 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8833 vect_location,
8834 "unsupported vect permute { ");
8835 for (i = 0; i < count; ++i)
8837 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8838 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8840 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8842 gcc_assert (!gsi);
8843 return -1;
8846 if (!identity_p)
8847 nperms++;
8848 if (gsi)
8850 if (second_vec.first == -1U)
8851 second_vec = first_vec;
8853 slp_tree
8854 first_node = children[first_vec.first],
8855 second_node = children[second_vec.first];
8857 tree mask_vec = NULL_TREE;
8858 if (!identity_p)
8859 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8861 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
8863 tree first_def
8864 = vect_get_slp_vect_def (first_node,
8865 first_vec.second + vi);
8866 tree second_def
8867 = vect_get_slp_vect_def (second_node,
8868 second_vec.second + vi);
8869 vect_add_slp_permutation (vinfo, gsi, node, first_def,
8870 second_def, mask_vec, mask[0]);
8874 index = 0;
8875 first_vec = std::make_pair (-1U, -1U);
8876 second_vec = std::make_pair (-1U, -1U);
8880 return nperms;
8883 /* Vectorize the SLP permutations in NODE as specified
8884 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
8885 child number and lane number.
8886 Interleaving of two two-lane two-child SLP subtrees (not supported):
8887 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
8888 A blend of two four-lane two-child SLP subtrees:
8889 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
8890 Highpart of a four-lane one-child SLP subtree (not supported):
8891 [ { 0, 2 }, { 0, 3 } ]
8892 Only a subset of these is currently supported by the code generation below. */
8894 static bool
8895 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8896 slp_tree node, stmt_vector_for_cost *cost_vec)
8898 tree vectype = SLP_TREE_VECTYPE (node);
8899 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
8900 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
8901 SLP_TREE_CHILDREN (node),
8902 dump_enabled_p ());
8903 if (nperms < 0)
8904 return false;
8906 if (!gsi)
8907 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
8909 return true;
8912 /* Vectorize SLP NODE. */
8914 static void
8915 vect_schedule_slp_node (vec_info *vinfo,
8916 slp_tree node, slp_instance instance)
8918 gimple_stmt_iterator si;
8919 int i;
8920 slp_tree child;
8922 /* For existing vectors there's nothing to do. */
8923 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
8924 && SLP_TREE_VEC_DEFS (node).exists ())
8925 return;
8927 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
8929 /* Vectorize externals and constants. */
8930 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
8931 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
8933 /* ??? vectorizable_shift can end up using a scalar operand which is
8934 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
8935 node in this case. */
8936 if (!SLP_TREE_VECTYPE (node))
8937 return;
8939 vect_create_constant_vectors (vinfo, node);
8940 return;
8943 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
8945 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
8946 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
8948 if (dump_enabled_p ())
8949 dump_printf_loc (MSG_NOTE, vect_location,
8950 "------>vectorizing SLP node starting from: %G",
8951 stmt_info->stmt);
8953 if (STMT_VINFO_DATA_REF (stmt_info)
8954 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8956 /* Vectorized loads go before the first scalar load to make it
8957 ready early; vectorized stores go before the last scalar
8958 stmt which is where all uses are ready. */
8959 stmt_vec_info last_stmt_info = NULL;
8960 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
8961 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
8962 else /* DR_IS_WRITE */
8963 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
8964 si = gsi_for_stmt (last_stmt_info->stmt);
8966 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
8967 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
8968 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
8969 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8971 /* For PHI node vectorization we do not use the insertion iterator. */
8972 si = gsi_none ();
8974 else
8976 /* Emit other stmts after the children's vectorized defs, which is
8977 the earliest possible place. */
8978 gimple *last_stmt = NULL;
8979 bool seen_vector_def = false;
8980 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8981 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8983 /* For fold-left reductions we are retaining the scalar
8984 reduction PHI but we still have SLP_TREE_NUMBER_OF_VEC_STMTS
8985 set, so the representation isn't perfect. Resort to the
8986 last scalar def here. */
8987 if (SLP_TREE_VEC_DEFS (child).is_empty ())
8989 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
8990 == cycle_phi_info_type);
8991 gphi *phi = as_a <gphi *>
8992 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
8993 if (!last_stmt
8994 || vect_stmt_dominates_stmt_p (last_stmt, phi))
8995 last_stmt = phi;
8997 /* We are emitting all vectorized stmts in the same place, so
8998 the last def generated for a child is also the last stmt emitted.
8999 ??? Unless we have a load permutation applied and that
9000 happens to re-use an earlier generated load. */
9001 unsigned j;
9002 tree vdef;
9003 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9005 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9006 if (!last_stmt
9007 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9008 last_stmt = vstmt;
9011 else if (!SLP_TREE_VECTYPE (child))
9013 /* Externals without a vector type are used unvectorized, so look at all their scalar defs. */
9014 unsigned j;
9015 tree def;
9016 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
9017 if (TREE_CODE (def) == SSA_NAME
9018 && !SSA_NAME_IS_DEFAULT_DEF (def))
9020 gimple *stmt = SSA_NAME_DEF_STMT (def);
9021 if (!last_stmt
9022 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
9023 last_stmt = stmt;
9026 else
9028 /* For externals we have to look at all defs since their
9029 insertion place is decided per vector. But beware
9030 of pre-existing vectors where we need to make sure
9031 we do not insert before the region boundary. */
9032 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
9033 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
9034 seen_vector_def = true;
9035 else
9037 unsigned j;
9038 tree vdef;
9039 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9040 if (TREE_CODE (vdef) == SSA_NAME
9041 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
9043 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9044 if (!last_stmt
9045 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9046 last_stmt = vstmt;
9050 /* This can happen when all children are pre-existing vectors or
9051 constants. */
9052 if (!last_stmt)
9053 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
9054 if (!last_stmt)
9056 gcc_assert (seen_vector_def);
9057 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9059 else if (is_ctrl_altering_stmt (last_stmt))
9061 /* We split regions to vectorize at control-altering stmts
9062 with a definition, so this must be an external which
9063 we can insert at the start of the region. */
9064 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9066 else if (is_a <bb_vec_info> (vinfo)
9067 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
9068 && gimple_could_trap_p (stmt_info->stmt))
9070 /* We've constrained possibly trapping operations to all come
9071 from the same basic-block; if vectorized defs would allow earlier
9072 scheduling, still force the vectorized stmts into the original block.
9073 This is only necessary for BB vectorization since for loop vect
9074 all operations are in a single BB and scalar-stmt-based
9075 placement doesn't play well with epilogue vectorization. */
9076 gcc_assert (dominated_by_p (CDI_DOMINATORS,
9077 gimple_bb (stmt_info->stmt),
9078 gimple_bb (last_stmt)));
9079 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
9081 else if (is_a <gphi *> (last_stmt))
9082 si = gsi_after_labels (gimple_bb (last_stmt));
9083 else
9085 si = gsi_for_stmt (last_stmt);
9086 gsi_next (&si);
9090 /* Handle purely internal nodes. */
9091 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9093 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
9094 be shared between different SLP nodes (but usually it's the same
9095 operation, apart from the case where the stmt is only there to denote
9096 the actual scalar lane defs ...). So do not call vect_transform_stmt
9097 but open-code it here (partly). */
9098 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
9099 gcc_assert (done);
9100 stmt_vec_info slp_stmt_info;
9101 unsigned int i;
9102 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9103 if (STMT_VINFO_LIVE_P (slp_stmt_info))
9105 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9106 instance, i, true, NULL);
9107 gcc_assert (done);
9110 else
9111 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9114 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
9115 For loop vectorization this is done in vectorizable_call, but for SLP
9116 it needs to be deferred until the end of vect_schedule_slp, because multiple
9117 SLP instances may refer to the same scalar stmt. */
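/* E.g. a fully vectorized scalar call lhs = __builtin_sqrtf (x) is
replaced by lhs = 0.0f below, leaving a trivially dead assignment
for later DCE to clean up. */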
9119 static void
9120 vect_remove_slp_scalar_calls (vec_info *vinfo,
9121 slp_tree node, hash_set<slp_tree> &visited)
9123 gimple *new_stmt;
9124 gimple_stmt_iterator gsi;
9125 int i;
9126 slp_tree child;
9127 tree lhs;
9128 stmt_vec_info stmt_info;
9130 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9131 return;
9133 if (visited.add (node))
9134 return;
9136 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9137 vect_remove_slp_scalar_calls (vinfo, child, visited);
9139 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9141 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
9142 if (!stmt || gimple_bb (stmt) == NULL)
9143 continue;
9144 if (is_pattern_stmt_p (stmt_info)
9145 || !PURE_SLP_STMT (stmt_info))
9146 continue;
9147 lhs = gimple_call_lhs (stmt);
9148 if (lhs)
9149 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9150 else
9152 new_stmt = gimple_build_nop ();
9153 unlink_stmt_vdef (stmt_info->stmt);
9155 gsi = gsi_for_stmt (stmt);
9156 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9157 if (lhs)
9158 SSA_NAME_DEF_STMT (lhs) = new_stmt;
9162 static void
9163 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9165 hash_set<slp_tree> visited;
9166 vect_remove_slp_scalar_calls (vinfo, node, visited);
9169 /* Vectorize the instance root. */
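/* The root is either the scalar assignment from a CONSTRUCTOR
(slp_inst_kind_ctor) or the final stmt of a basic-block reduction
(slp_inst_kind_bb_reduc); it is rewritten to use the vector defs
produced for the SLP instance tree. */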
9171 void
9172 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9174 gassign *rstmt = NULL;
9176 if (instance->kind == slp_inst_kind_ctor)
9178 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9180 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9181 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9182 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9183 TREE_TYPE (vect_lhs)))
9184 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9185 vect_lhs);
9186 rstmt = gimple_build_assign (root_lhs, vect_lhs);
9188 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9190 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9191 tree child_def;
9192 int j;
9193 vec<constructor_elt, va_gc> *v;
9194 vec_alloc (v, nelts);
9196 /* A CTOR can handle V16HI composition from VNx8HI so we
9197 do not need to convert vector elements if the types
9198 do not match. */
9199 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9200 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9201 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9202 tree rtype
9203 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9204 tree r_constructor = build_constructor (rtype, v);
9205 rstmt = gimple_build_assign (lhs, r_constructor);
9208 else if (instance->kind == slp_inst_kind_bb_reduc)
9210 /* Largely inspired by reduction chain epilogue handling in
9211 vect_create_epilog_for_reduction. */
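/* E.g. for a root like s = a0 + a1 + ... + a7 computed from two V4SI
vector defs, the defs are first added into a single vector, reduced
to a scalar with .REDUC_PLUS and combined with any remaining scalar
defs before replacing the rhs of the root stmt. */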
9212 vec<tree> vec_defs = vNULL;
9213 vect_get_slp_defs (node, &vec_defs);
9214 enum tree_code reduc_code
9215 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9216 /* ??? We actually have to reflect signs somewhere. */
9217 if (reduc_code == MINUS_EXPR)
9218 reduc_code = PLUS_EXPR;
9219 gimple_seq epilogue = NULL;
9220 /* We may end up with more than one vector result; reduce them
9221 to one vector. */
9222 tree vec_def = vec_defs[0];
9223 tree vectype = TREE_TYPE (vec_def);
9224 tree compute_vectype = vectype;
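/* The vectorized reduction reassociates the scalar additions, which
could introduce intermediate signed overflow; when that would be
undefined, do the vector arithmetic in the corresponding unsigned
type and convert the final result back. */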
9225 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9226 && TYPE_OVERFLOW_UNDEFINED (vectype)
9227 && operation_can_overflow (reduc_code));
9228 if (pun_for_overflow_p)
9230 compute_vectype = unsigned_type_for (vectype);
9231 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9232 compute_vectype, vec_def);
9234 for (unsigned i = 1; i < vec_defs.length (); ++i)
9236 tree def = vec_defs[i];
9237 if (pun_for_overflow_p)
9238 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9239 compute_vectype, def);
9240 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
9241 vec_def, def);
9243 vec_defs.release ();
9244 /* ??? Support schemes other than a direct internal fn. */
9245 internal_fn reduc_fn;
9246 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9247 || reduc_fn == IFN_LAST)
9248 gcc_unreachable ();
9249 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9250 TREE_TYPE (compute_vectype), vec_def);
9251 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9253 tree rem_def = NULL_TREE;
9254 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9256 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
9257 if (!rem_def)
9258 rem_def = def;
9259 else
9260 rem_def = gimple_build (&epilogue, reduc_code,
9261 TREE_TYPE (scalar_def),
9262 rem_def, def);
9264 scalar_def = gimple_build (&epilogue, reduc_code,
9265 TREE_TYPE (scalar_def),
9266 scalar_def, rem_def);
9268 scalar_def = gimple_convert (&epilogue,
9269 TREE_TYPE (vectype), scalar_def);
9270 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9271 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9272 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9273 update_stmt (gsi_stmt (rgsi));
9274 return;
9276 else
9277 gcc_unreachable ();
9279 gcc_assert (rstmt);
9281 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9282 gsi_replace (&rgsi, rstmt, true);
9285 struct slp_scc_info
9287 bool on_stack;
9288 int dfs;
9289 int lowlink;
9292 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
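/* The dfs/lowlink/on_stack bookkeeping below is Tarjan's SCC algorithm:
an SCC is complete (and gets scheduled) when we return to its root,
the node whose lowlink still equals its own dfs number. */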
9294 static void
9295 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9296 hash_map<slp_tree, slp_scc_info> &scc_info,
9297 int &maxdfs, vec<slp_tree> &stack)
9299 bool existed_p;
9300 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9301 gcc_assert (!existed_p);
9302 info->dfs = maxdfs;
9303 info->lowlink = maxdfs;
9304 maxdfs++;
9306 /* Leaf. */
9307 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9309 info->on_stack = false;
9310 vect_schedule_slp_node (vinfo, node, instance);
9311 return;
9314 info->on_stack = true;
9315 stack.safe_push (node);
9317 unsigned i;
9318 slp_tree child;
9319 /* DFS recurse. */
9320 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9322 if (!child)
9323 continue;
9324 slp_scc_info *child_info = scc_info.get (child);
9325 if (!child_info)
9327 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9328 /* Recursion might have re-allocated the node. */
9329 info = scc_info.get (node);
9330 child_info = scc_info.get (child);
9331 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9333 else if (child_info->on_stack)
9334 info->lowlink = MIN (info->lowlink, child_info->dfs);
9336 if (info->lowlink != info->dfs)
9337 return;
9339 auto_vec<slp_tree, 4> phis_to_fixup;
9341 /* Singleton. */
9342 if (stack.last () == node)
9344 stack.pop ();
9345 info->on_stack = false;
9346 vect_schedule_slp_node (vinfo, node, instance);
9347 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9348 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9349 phis_to_fixup.quick_push (node);
9351 else
9353 /* SCC. */
9354 int last_idx = stack.length () - 1;
9355 while (stack[last_idx] != node)
9356 last_idx--;
9357 /* We can break the cycle at PHIs which have at least one child
9358 code generated. Then we could re-start the DFS walk until
9359 all nodes in the SCC are covered (we might have new entries
9360 for only back-reachable nodes). But it's simpler to just
9361 iterate and schedule those that are ready. */
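/* A PHI counts as ready once at least one of its children has been
scheduled (or is the absent backedge child), while any other node is
ready only when none of its children remain on the SCC stack. */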
9362 unsigned todo = stack.length () - last_idx;
9365 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9367 slp_tree entry = stack[idx];
9368 if (!entry)
9369 continue;
9370 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9371 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9372 bool ready = !phi;
9373 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9374 if (!child)
9376 gcc_assert (phi);
9377 ready = true;
9378 break;
9380 else if (scc_info.get (child)->on_stack)
9382 if (!phi)
9384 ready = false;
9385 break;
9388 else
9390 if (phi)
9392 ready = true;
9393 break;
9396 if (ready)
9398 vect_schedule_slp_node (vinfo, entry, instance);
9399 scc_info.get (entry)->on_stack = false;
9400 stack[idx] = NULL;
9401 todo--;
9402 if (phi)
9403 phis_to_fixup.safe_push (entry);
9407 while (todo != 0);
9409 /* Pop the SCC. */
9410 stack.truncate (last_idx);
9413 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
9414 slp_tree phi_node;
9415 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9417 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9418 edge_iterator ei;
9419 edge e;
9420 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9422 unsigned dest_idx = e->dest_idx;
9423 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9424 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9425 continue;
9426 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9427 /* Simply fill all args. */
9428 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9429 != vect_first_order_recurrence)
9430 for (unsigned i = 0; i < n; ++i)
9432 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9433 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9434 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
9435 e, gimple_phi_arg_location (phi, dest_idx));
9437 else
9439 /* Unless it is a first-order recurrence, which needs
9440 args filled in for both the PHI node and the permutes. */
9441 gimple *perm
9442 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9443 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9444 add_phi_arg (as_a <gphi *> (rphi),
9445 vect_get_slp_vect_def (child, n - 1),
9446 e, gimple_phi_arg_location (phi, dest_idx));
9447 for (unsigned i = 0; i < n; ++i)
9449 gimple *perm
9450 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9451 if (i > 0)
9452 gimple_assign_set_rhs1 (perm,
9453 vect_get_slp_vect_def (child, i - 1));
9454 gimple_assign_set_rhs2 (perm,
9455 vect_get_slp_vect_def (child, i));
9456 update_stmt (perm);
9463 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9465 void
9466 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9468 slp_instance instance;
9469 unsigned int i;
9471 hash_map<slp_tree, slp_scc_info> scc_info;
9472 int maxdfs = 0;
9473 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9475 slp_tree node = SLP_INSTANCE_TREE (instance);
9476 if (dump_enabled_p ())
9478 dump_printf_loc (MSG_NOTE, vect_location,
9479 "Vectorizing SLP tree:\n");
9480 /* ??? Dump all? */
9481 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9482 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9483 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9484 vect_print_slp_graph (MSG_NOTE, vect_location,
9485 SLP_INSTANCE_TREE (instance));
9487 /* Schedule the tree of INSTANCE, scheduling SCCs in a way that
9488 a PHI is the node breaking the cycle. */
9489 auto_vec<slp_tree> stack;
9490 if (!scc_info.get (node))
9491 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9493 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9494 vectorize_slp_instance_root_stmt (node, instance);
9496 if (dump_enabled_p ())
9497 dump_printf_loc (MSG_NOTE, vect_location,
9498 "vectorizing stmts using SLP.\n");
9501 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9503 slp_tree root = SLP_INSTANCE_TREE (instance);
9504 stmt_vec_info store_info;
9505 unsigned int j;
9507 /* Remove scalar call stmts. Do not do this for basic-block
9508 vectorization as not all uses may be vectorized.
9509 ??? Why should this be necessary? DCE should be able to
9510 remove the stmts itself.
9511 ??? For BB vectorization we can as well remove scalar
9512 stmts starting from the SLP tree root if they have no
9513 uses. */
9514 if (is_a <loop_vec_info> (vinfo))
9515 vect_remove_slp_scalar_calls (vinfo, root);
9517 /* Remove the vectorized stores' original scalar stmts. */
9518 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9520 if (!STMT_VINFO_DATA_REF (store_info)
9521 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9522 break;
9524 store_info = vect_orig_stmt (store_info);
9525 /* Free the attached stmt_vec_info and remove the stmt. */
9526 vinfo->remove_stmt (store_info);
9528 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
9529 so we do not crash in vect_free_slp_tree later. */
9530 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9531 SLP_TREE_REPRESENTATIVE (root) = NULL;