gcc/tree-vect-slp.cc
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2022 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "insn-config.h"
35 #include "recog.h" /* FIXME: for insn_data */
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 #include "dump-context.h"
49 #include "cfganal.h"
50 #include "tree-eh.h"
51 #include "tree-cfg.h"
52 #include "alloc-pool.h"
53 #include "sreal.h"
54 #include "predict.h"
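/* Forward declarations for helpers defined later in this file.  */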
56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
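/* SLP nodes are allocated from this pool and additionally chained into a
   list headed by slp_first_node so that vect_slp_fini can release any
   nodes that are still live when SLP analysis finishes.  */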
71 static object_allocator<_slp_tree> *slp_tree_pool;
72 static slp_tree slp_first_node;
74 void
75 vect_slp_init (void)
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
80 void
81 vect_slp_fini (void)
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
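/* Allocate and release SLP nodes from the dedicated object pool rather
   than the general heap.  */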
89 void *
90 _slp_tree::operator new (size_t n)
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
96 void
97 _slp_tree::operator delete (void *node, size_t n)
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (node);
104 /* Initialize a SLP node. */
106 _slp_tree::_slp_tree ()
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_STMTS (this) = vNULL;
116 SLP_TREE_VEC_DEFS (this) = vNULL;
117 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
118 SLP_TREE_CHILDREN (this) = vNULL;
119 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
120 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
121 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
122 SLP_TREE_CODE (this) = ERROR_MARK;
123 SLP_TREE_VECTYPE (this) = NULL_TREE;
124 SLP_TREE_REPRESENTATIVE (this) = NULL;
125 SLP_TREE_REF_COUNT (this) = 1;
126 this->failed = NULL;
127 this->max_nunits = 1;
128 this->lanes = 0;
131 /* Tear down a SLP node. */
133 _slp_tree::~_slp_tree ()
135 if (this->prev_node)
136 this->prev_node->next_node = this->next_node;
137 else
138 slp_first_node = this->next_node;
139 if (this->next_node)
140 this->next_node->prev_node = this->prev_node;
141 SLP_TREE_CHILDREN (this).release ();
142 SLP_TREE_SCALAR_STMTS (this).release ();
143 SLP_TREE_SCALAR_OPS (this).release ();
144 SLP_TREE_VEC_STMTS (this).release ();
145 SLP_TREE_VEC_DEFS (this).release ();
146 SLP_TREE_LOAD_PERMUTATION (this).release ();
147 SLP_TREE_LANE_PERMUTATION (this).release ();
148 if (this->failed)
149 free (failed);
152 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
154 void
155 vect_free_slp_tree (slp_tree node)
157 int i;
158 slp_tree child;
160 if (--SLP_TREE_REF_COUNT (node) != 0)
161 return;
163 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
164 if (child)
165 vect_free_slp_tree (child);
167 /* If the node defines any SLP-only patterns then those patterns are no
168 longer valid and should be removed. */
169 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
170 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
172 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
173 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
174 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
177 delete node;
180 /* Return a location suitable for dumps related to the SLP instance. */
182 dump_user_location_t
183 _slp_instance::location () const
185 if (!root_stmts.is_empty ())
186 return root_stmts[0]->stmt;
187 else
188 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
192 /* Free the memory allocated for the SLP instance. */
194 void
195 vect_free_slp_instance (slp_instance instance)
197 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
198 SLP_INSTANCE_LOADS (instance).release ();
199 SLP_INSTANCE_ROOT_STMTS (instance).release ();
200 instance->subgraph_entries.release ();
201 instance->cost_vec.release ();
202 free (instance);
206 /* Create an SLP node without scalar statements, for operation CODE with NOPS children. */
208 slp_tree
209 vect_create_new_slp_node (unsigned nops, tree_code code)
211 slp_tree node = new _slp_tree;
212 SLP_TREE_SCALAR_STMTS (node) = vNULL;
213 SLP_TREE_CHILDREN (node).create (nops);
214 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
215 SLP_TREE_CODE (node) = code;
216 return node;
218 /* Create an SLP node for SCALAR_STMTS. */
220 static slp_tree
221 vect_create_new_slp_node (slp_tree node,
222 vec<stmt_vec_info> scalar_stmts, unsigned nops)
224 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
225 SLP_TREE_CHILDREN (node).create (nops);
226 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
227 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
228 SLP_TREE_LANES (node) = scalar_stmts.length ();
229 return node;
232 /* Create an SLP node for SCALAR_STMTS. */
234 static slp_tree
235 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
237 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
240 /* Create an SLP node for OPS. */
242 static slp_tree
243 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
245 SLP_TREE_SCALAR_OPS (node) = ops;
246 SLP_TREE_DEF_TYPE (node) = vect_external_def;
247 SLP_TREE_LANES (node) = ops.length ();
248 return node;
251 /* Create an SLP node for OPS. */
253 static slp_tree
254 vect_create_new_slp_node (vec<tree> ops)
256 return vect_create_new_slp_node (new _slp_tree, ops);
260 /* This structure is used in creation of an SLP tree. Each instance
261 corresponds to the same operand in a group of scalar stmts in an SLP
262 node. */
263 typedef struct _slp_oprnd_info
265 /* Def-stmts for the operands. */
266 vec<stmt_vec_info> def_stmts;
267 /* Operands. */
268 vec<tree> ops;
269 /* Information about the first statement, its vector def-type, type, the
270 operand itself in case it's constant, and an indication if it's a pattern
271 stmt. */
272 tree first_op_type;
273 enum vect_def_type first_dt;
274 bool any_pattern;
275 } *slp_oprnd_info;
278 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
279 operand. */
280 static vec<slp_oprnd_info>
281 vect_create_oprnd_info (int nops, int group_size)
283 int i;
284 slp_oprnd_info oprnd_info;
285 vec<slp_oprnd_info> oprnds_info;
287 oprnds_info.create (nops);
288 for (i = 0; i < nops; i++)
290 oprnd_info = XNEW (struct _slp_oprnd_info);
291 oprnd_info->def_stmts.create (group_size);
292 oprnd_info->ops.create (group_size);
293 oprnd_info->first_dt = vect_uninitialized_def;
294 oprnd_info->first_op_type = NULL_TREE;
295 oprnd_info->any_pattern = false;
296 oprnds_info.quick_push (oprnd_info);
299 return oprnds_info;
303 /* Free operands info. */
305 static void
306 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
308 int i;
309 slp_oprnd_info oprnd_info;
311 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
313 oprnd_info->def_stmts.release ();
314 oprnd_info->ops.release ();
315 XDELETE (oprnd_info);
318 oprnds_info.release ();
321 /* Return the execution frequency of NODE (so that a higher value indicates
322 a "more important" node when optimizing for speed). */
324 static sreal
325 vect_slp_node_weight (slp_tree node)
327 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
328 basic_block bb = gimple_bb (stmt_info->stmt);
329 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
332 /* Return true if STMTS contains a pattern statement. */
334 static bool
335 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
337 stmt_vec_info stmt_info;
338 unsigned int i;
339 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
340 if (is_pattern_stmt_p (stmt_info))
341 return true;
342 return false;
345 /* Return true when all lanes in the external or constant NODE have
346 the same value. */
348 static bool
349 vect_slp_tree_uniform_p (slp_tree node)
351 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
352 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
354 /* Pre-existing vectors. */
355 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
356 return false;
358 unsigned i;
359 tree op, first = NULL_TREE;
360 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
361 if (!first)
362 first = op;
363 else if (!operand_equal_p (first, op, 0))
364 return false;
366 return true;
369 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
370 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
371 of the chain. */
373 int
374 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
375 stmt_vec_info first_stmt_info)
377 stmt_vec_info next_stmt_info = first_stmt_info;
378 int result = 0;
380 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
381 return -1;
383 do
385 if (next_stmt_info == stmt_info)
386 return result;
387 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
388 if (next_stmt_info)
389 result += DR_GROUP_GAP (next_stmt_info);
391 while (next_stmt_info);
393 return -1;
396 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
397 using the method implemented by duplicate_and_interleave. Return true
398 if so, returning the number of intermediate vectors in *NVECTORS_OUT
399 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
400 (if nonnull). */
402 bool
403 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
404 tree elt_type, unsigned int *nvectors_out,
405 tree *vector_type_out,
406 tree *permutes)
408 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
409 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
410 return false;
412 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
413 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
414 unsigned int nvectors = 1;
415 for (;;)
417 scalar_int_mode int_mode;
418 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
419 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
421 /* Get the natural vector type for this SLP group size. */
422 tree int_type = build_nonstandard_integer_type
423 (GET_MODE_BITSIZE (int_mode), 1);
424 tree vector_type
425 = get_vectype_for_scalar_type (vinfo, int_type, count);
426 if (vector_type
427 && VECTOR_MODE_P (TYPE_MODE (vector_type))
428 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
429 GET_MODE_SIZE (base_vector_mode)))
431 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
432 together into elements of type INT_TYPE and using the result
433 to build NVECTORS vectors. */
434 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
435 vec_perm_builder sel1 (nelts, 2, 3);
436 vec_perm_builder sel2 (nelts, 2, 3);
437 poly_int64 half_nelts = exact_div (nelts, 2);
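/* sel1 interleaves the low halves of the two input vectors, sel2 the
   high halves; these are the permutations checked here and returned
   via PERMUTES for use by duplicate_and_interleave.  */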
438 for (unsigned int i = 0; i < 3; ++i)
440 sel1.quick_push (i);
441 sel1.quick_push (i + nelts);
442 sel2.quick_push (half_nelts + i);
443 sel2.quick_push (half_nelts + i + nelts);
445 vec_perm_indices indices1 (sel1, 2, nelts);
446 vec_perm_indices indices2 (sel2, 2, nelts);
447 machine_mode vmode = TYPE_MODE (vector_type);
448 if (can_vec_perm_const_p (vmode, vmode, indices1)
449 && can_vec_perm_const_p (vmode, vmode, indices2))
451 if (nvectors_out)
452 *nvectors_out = nvectors;
453 if (vector_type_out)
454 *vector_type_out = vector_type;
455 if (permutes)
457 permutes[0] = vect_gen_perm_mask_checked (vector_type,
458 indices1);
459 permutes[1] = vect_gen_perm_mask_checked (vector_type,
460 indices2);
462 return true;
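/* Fused elements of this size do not work; halve the element size,
   double the number of intermediate vectors and retry.  */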
466 if (!multiple_p (elt_bytes, 2, &elt_bytes))
467 return false;
468 nvectors *= 2;
472 /* Return true if DTA and DTB match. */
474 static bool
475 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
477 return (dta == dtb
478 || ((dta == vect_external_def || dta == vect_constant_def)
479 && (dtb == vect_external_def || dtb == vect_constant_def)));
482 static const int cond_expr_maps[3][5] = {
483 { 4, -1, -2, 1, 2 },
484 { 4, -2, -1, 1, 2 },
485 { 4, -1, -2, 2, 1 }
487 static const int arg1_map[] = { 1, 1 };
488 static const int arg2_map[] = { 1, 2 };
489 static const int arg1_arg4_map[] = { 2, 1, 4 };
490 static const int op1_op0_map[] = { 2, 1, 0 };
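/* As an illustration, cond_expr_maps[0] = { 4, -1, -2, 1, 2 } says that a
   COND_EXPR with an embedded comparison has four children: the two operands
   of the comparison (-1 and -2) followed by gimple arguments 1 and 2, the
   then and else values (see vect_get_operand_map below).  */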
492 /* For most SLP statements, there is a one-to-one mapping between
493 gimple arguments and child nodes. If that is not true for STMT,
494 return an array that contains:
496 - the number of child nodes, followed by
497 - for each child node, the index of the argument associated with that node.
498 The special index -1 denotes the first operand of an embedded comparison and
499 the special index -2 denotes the second operand of an embedded comparison.
501 SWAP is as for vect_get_and_check_slp_defs. */
503 static const int *
504 vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
506 if (auto assign = dyn_cast<const gassign *> (stmt))
508 if (gimple_assign_rhs_code (assign) == COND_EXPR
509 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
510 return cond_expr_maps[swap];
511 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
512 && swap)
513 return op1_op0_map;
515 gcc_assert (!swap);
516 if (auto call = dyn_cast<const gcall *> (stmt))
518 if (gimple_call_internal_p (call))
519 switch (gimple_call_internal_fn (call))
521 case IFN_MASK_LOAD:
522 return arg2_map;
524 case IFN_GATHER_LOAD:
525 return arg1_map;
527 case IFN_MASK_GATHER_LOAD:
528 return arg1_arg4_map;
530 default:
531 break;
534 return nullptr;
537 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
538 they are of a valid type and that they match the defs of the first stmt of
539 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
540 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
541 indicates that a swap is required for cond_expr stmts. Specifically, SWAP
542 is 1 if STMT is a COND_EXPR and the operands of the comparison need to be swapped;
543 SWAP is 2 if STMT is a COND_EXPR and the code of the comparison needs to be inverted.
545 If there was a fatal error return -1; if the error could be corrected by
546 swapping operands of the parent node of this one, return 1; if everything
547 is ok return 0. */
548 static int
549 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
550 bool *skip_args,
551 vec<stmt_vec_info> stmts, unsigned stmt_num,
552 vec<slp_oprnd_info> *oprnds_info)
554 stmt_vec_info stmt_info = stmts[stmt_num];
555 tree oprnd;
556 unsigned int i, number_of_oprnds;
557 enum vect_def_type dt = vect_uninitialized_def;
558 slp_oprnd_info oprnd_info;
559 unsigned int commutative_op = -1U;
560 bool first = stmt_num == 0;
562 if (!is_a<gcall *> (stmt_info->stmt)
563 && !is_a<gassign *> (stmt_info->stmt)
564 && !is_a<gphi *> (stmt_info->stmt))
565 return -1;
567 number_of_oprnds = gimple_num_args (stmt_info->stmt);
568 const int *map = vect_get_operand_map (stmt_info->stmt, swap);
569 if (map)
570 number_of_oprnds = *map++;
571 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
573 if (gimple_call_internal_p (stmt))
575 internal_fn ifn = gimple_call_internal_fn (stmt);
576 commutative_op = first_commutative_argument (ifn);
579 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
581 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
582 commutative_op = 0;
585 bool swapped = (swap != 0);
586 bool backedge = false;
587 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
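/* Record the def type of each operand; the swapping logic below may
   reorder these entries together with the operands themselves.  */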
588 for (i = 0; i < number_of_oprnds; i++)
590 int opno = map ? map[i] : int (i);
591 if (opno < 0)
592 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
593 else
595 oprnd = gimple_arg (stmt_info->stmt, opno);
596 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
597 backedge = dominated_by_p (CDI_DOMINATORS,
598 gimple_phi_arg_edge (stmt, opno)->src,
599 gimple_bb (stmt_info->stmt));
601 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
602 oprnd = TREE_OPERAND (oprnd, 0);
604 oprnd_info = (*oprnds_info)[i];
606 stmt_vec_info def_stmt_info;
607 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
611 "Build SLP failed: can't analyze def for %T\n",
612 oprnd);
614 return -1;
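/* Operands flagged in SKIP_ARGS are not analyzed; record placeholder
   entries for them.  */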
617 if (skip_args[i])
619 oprnd_info->def_stmts.quick_push (NULL);
620 oprnd_info->ops.quick_push (NULL_TREE);
621 oprnd_info->first_dt = vect_uninitialized_def;
622 continue;
625 oprnd_info->def_stmts.quick_push (def_stmt_info);
626 oprnd_info->ops.quick_push (oprnd);
628 if (def_stmt_info
629 && is_pattern_stmt_p (def_stmt_info))
631 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
632 != def_stmt_info)
633 oprnd_info->any_pattern = true;
634 else
635 /* If we promote this to external use the original stmt def. */
636 oprnd_info->ops.last ()
637 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
640 /* If there's an extern def on a backedge make sure we can
641 code-generate at the region start.
642 ??? This is another case that could be fixed by adjusting
643 how we split the function but at the moment we'd have conflicting
644 goals there. */
645 if (backedge
646 && dts[i] == vect_external_def
647 && is_a <bb_vec_info> (vinfo)
648 && TREE_CODE (oprnd) == SSA_NAME
649 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
650 && !dominated_by_p (CDI_DOMINATORS,
651 as_a <bb_vec_info> (vinfo)->bbs[0],
652 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
654 if (dump_enabled_p ())
655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
656 "Build SLP failed: extern def %T only defined "
657 "on backedge\n", oprnd);
658 return -1;
661 if (first)
663 tree type = TREE_TYPE (oprnd);
664 dt = dts[i];
665 if ((dt == vect_constant_def
666 || dt == vect_external_def)
667 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
668 && (TREE_CODE (type) == BOOLEAN_TYPE
669 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
670 type)))
672 if (dump_enabled_p ())
673 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
674 "Build SLP failed: invalid type of def "
675 "for variable-length SLP %T\n", oprnd);
676 return -1;
679 /* For the swapping logic below force vect_reduction_def
680 for the reduction op in a SLP reduction group. */
681 if (!STMT_VINFO_DATA_REF (stmt_info)
682 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
683 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
684 && def_stmt_info)
685 dts[i] = dt = vect_reduction_def;
687 /* Check the types of the definition. */
688 switch (dt)
690 case vect_external_def:
691 case vect_constant_def:
692 case vect_internal_def:
693 case vect_reduction_def:
694 case vect_induction_def:
695 case vect_nested_cycle:
696 break;
698 default:
699 /* FORNOW: Not supported. */
700 if (dump_enabled_p ())
701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
702 "Build SLP failed: illegal type of def %T\n",
703 oprnd);
704 return -1;
707 oprnd_info->first_dt = dt;
708 oprnd_info->first_op_type = type;
711 if (first)
712 return 0;
714 /* Now match the operand definition types to that of the first stmt. */
715 for (i = 0; i < number_of_oprnds;)
717 if (skip_args[i])
719 ++i;
720 continue;
723 oprnd_info = (*oprnds_info)[i];
724 dt = dts[i];
725 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
726 oprnd = oprnd_info->ops[stmt_num];
727 tree type = TREE_TYPE (oprnd);
729 if (!types_compatible_p (oprnd_info->first_op_type, type))
731 if (dump_enabled_p ())
732 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
733 "Build SLP failed: different operand types\n");
734 return 1;
737 /* Not the first stmt of the group; check that the def-stmt/s match
738 the def-stmt/s of the first stmt. Allow different definition
739 types for reduction chains: the first stmt must be a
740 vect_reduction_def (a phi node), and the rest
741 must belong to the same reduction chain. */
742 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
743 && !(oprnd_info->first_dt == vect_reduction_def
744 && !STMT_VINFO_DATA_REF (stmt_info)
745 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
746 && def_stmt_info
747 && !STMT_VINFO_DATA_REF (def_stmt_info)
748 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
749 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
750 || (!STMT_VINFO_DATA_REF (stmt_info)
751 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
752 && ((!def_stmt_info
753 || STMT_VINFO_DATA_REF (def_stmt_info)
754 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
755 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
756 != (oprnd_info->first_dt != vect_reduction_def))))
758 /* Try swapping operands if we got a mismatch. For BB
759 vectorization only in case it will clearly improve things. */
760 if (i == commutative_op && !swapped
761 && (!is_a <bb_vec_info> (vinfo)
762 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
763 dts[i+1])
764 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
765 || vect_def_types_match
766 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
768 if (dump_enabled_p ())
769 dump_printf_loc (MSG_NOTE, vect_location,
770 "trying swapped operands\n");
771 std::swap (dts[i], dts[i+1]);
772 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
773 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
774 std::swap ((*oprnds_info)[i]->ops[stmt_num],
775 (*oprnds_info)[i+1]->ops[stmt_num]);
776 swapped = true;
777 continue;
780 if (is_a <bb_vec_info> (vinfo)
781 && !oprnd_info->any_pattern)
783 /* Now for commutative ops we should see whether we can
784 make the other operand match as well. */
785 if (dump_enabled_p ())
786 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
787 "treating operand as external\n");
788 oprnd_info->first_dt = dt = vect_external_def;
790 else
792 if (dump_enabled_p ())
793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
794 "Build SLP failed: different types\n");
795 return 1;
799 /* Make sure to demote the overall operand to external. */
800 if (dt == vect_external_def)
801 oprnd_info->first_dt = vect_external_def;
802 /* For a SLP reduction chain we want to duplicate the reduction to
803 each of the chain members. That gets us a sane SLP graph (though
804 the stmts are not 100% correct wrt the initial values). */
805 else if ((dt == vect_internal_def
806 || dt == vect_reduction_def)
807 && oprnd_info->first_dt == vect_reduction_def
808 && !STMT_VINFO_DATA_REF (stmt_info)
809 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
810 && !STMT_VINFO_DATA_REF (def_stmt_info)
811 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
812 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
814 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
815 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
818 ++i;
821 /* Swap operands. */
822 if (swapped)
824 if (dump_enabled_p ())
825 dump_printf_loc (MSG_NOTE, vect_location,
826 "swapped operands to match def types in %G",
827 stmt_info->stmt);
830 return 0;
833 /* Return true if call statements CALL1 and CALL2 are similar enough
834 to be combined into the same SLP group. */
836 bool
837 compatible_calls_p (gcall *call1, gcall *call2)
839 unsigned int nargs = gimple_call_num_args (call1);
840 if (nargs != gimple_call_num_args (call2))
841 return false;
843 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
844 return false;
846 if (gimple_call_internal_p (call1))
848 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
849 TREE_TYPE (gimple_call_lhs (call2))))
850 return false;
851 for (unsigned int i = 0; i < nargs; ++i)
852 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
853 TREE_TYPE (gimple_call_arg (call2, i))))
854 return false;
856 else
858 if (!operand_equal_p (gimple_call_fn (call1),
859 gimple_call_fn (call2), 0))
860 return false;
862 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
863 return false;
866 /* Check that any unvectorized arguments are equal. */
867 if (const int *map = vect_get_operand_map (call1))
869 unsigned int nkept = *map++;
870 unsigned int mapi = 0;
871 for (unsigned int i = 0; i < nargs; ++i)
872 if (mapi < nkept && map[mapi] == int (i))
873 mapi += 1;
874 else if (!operand_equal_p (gimple_call_arg (call1, i),
875 gimple_call_arg (call2, i)))
876 return false;
879 return true;
882 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
883 caller's attempt to find the vector type in STMT_INFO with the narrowest
884 element type. Return true if VECTYPE is nonnull and if it is valid
885 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
886 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
887 vect_build_slp_tree. */
889 static bool
890 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
891 unsigned int group_size,
892 tree vectype, poly_uint64 *max_nunits)
894 if (!vectype)
896 if (dump_enabled_p ())
897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
898 "Build SLP failed: unsupported data-type in %G\n",
899 stmt_info->stmt);
900 /* Fatal mismatch. */
901 return false;
904 /* If populating the vector type requires unrolling then fail
905 before adjusting *max_nunits for basic-block vectorization. */
906 if (is_a <bb_vec_info> (vinfo)
907 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
909 if (dump_enabled_p ())
910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
911 "Build SLP failed: unrolling required "
912 "in basic block SLP\n");
913 /* Fatal mismatch. */
914 return false;
917 /* In case of multiple types we need to detect the smallest type. */
918 vect_update_max_nunits (max_nunits, vectype);
919 return true;
922 /* Verify whether the scalar stmts STMTS are isomorphic, whether they require
923 data permutation and whether they use unsupported types of operation. Return
924 true if they can form an SLP group, otherwise return false and indicate in *MATCHES
925 which stmts are not isomorphic to the first one. If MATCHES[0]
926 is false then this indicates the comparison could not be
927 carried out or the stmts will never be vectorized by SLP.
929 Note a COND_EXPR is possibly isomorphic to another one after swapping its
930 operands. Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic to
931 the first stmt by swapping the two operands of the comparison; set SWAP[i]
932 to 2 if stmt I is isomorphic to the first stmt by inverting the code
933 of the comparison. Take A1 >= B1 ? X1 : Y1 as an example; it can be swapped
934 to (B1 <= A1 ? X1 : Y1) or be inverted to (A1 < B1) ? Y1 : X1. */
936 static bool
937 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
938 vec<stmt_vec_info> stmts, unsigned int group_size,
939 poly_uint64 *max_nunits, bool *matches,
940 bool *two_operators, tree *node_vectype)
942 unsigned int i;
943 stmt_vec_info first_stmt_info = stmts[0];
944 code_helper first_stmt_code = ERROR_MARK;
945 code_helper alt_stmt_code = ERROR_MARK;
946 code_helper rhs_code = ERROR_MARK;
947 code_helper first_cond_code = ERROR_MARK;
948 tree lhs;
949 bool need_same_oprnds = false;
950 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
951 stmt_vec_info first_load = NULL, prev_first_load = NULL;
952 bool first_stmt_load_p = false, load_p = false;
953 bool first_stmt_phi_p = false, phi_p = false;
954 bool maybe_soft_fail = false;
955 tree soft_fail_nunits_vectype = NULL_TREE;
957 /* For every stmt in NODE find its def stmt/s. */
958 stmt_vec_info stmt_info;
959 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
961 gimple *stmt = stmt_info->stmt;
962 swap[i] = 0;
963 matches[i] = false;
965 if (dump_enabled_p ())
966 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
968 /* Fail to vectorize statements that are marked as unvectorizable, that
969 can throw, or that have volatile operands. */
970 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
971 || stmt_can_throw_internal (cfun, stmt)
972 || gimple_has_volatile_ops (stmt))
974 if (dump_enabled_p ())
975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
976 "Build SLP failed: unvectorizable statement %G",
977 stmt);
978 /* ??? For BB vectorization we want to commute operands in a way
979 that shuffles all unvectorizable defs into one operand and leaves
980 the other still vectorizable. The following doesn't reliably
981 achieve this, but it's the easiest we can do here. */
982 if (is_a <bb_vec_info> (vinfo) && i != 0)
983 continue;
984 /* Fatal mismatch. */
985 matches[0] = false;
986 return false;
989 lhs = gimple_get_lhs (stmt);
990 if (lhs == NULL_TREE)
992 if (dump_enabled_p ())
993 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
994 "Build SLP failed: not GIMPLE_ASSIGN nor "
995 "GIMPLE_CALL %G", stmt);
996 if (is_a <bb_vec_info> (vinfo) && i != 0)
997 continue;
998 /* Fatal mismatch. */
999 matches[0] = false;
1000 return false;
1003 tree nunits_vectype;
1004 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1005 &nunits_vectype, group_size))
1007 if (is_a <bb_vec_info> (vinfo) && i != 0)
1008 continue;
1009 /* Fatal mismatch. */
1010 matches[0] = false;
1011 return false;
1013 /* Record nunits required but continue analysis, producing matches[]
1014 as if nunits was not an issue. This allows splitting of groups
1015 to happen. */
1016 if (nunits_vectype
1017 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1018 nunits_vectype, max_nunits))
1020 gcc_assert (is_a <bb_vec_info> (vinfo));
1021 maybe_soft_fail = true;
1022 soft_fail_nunits_vectype = nunits_vectype;
1025 gcc_assert (vectype);
1027 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1028 if (call_stmt)
1030 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1031 if (cfn != CFN_LAST)
1032 rhs_code = cfn;
1033 else
1034 rhs_code = CALL_EXPR;
1036 if (cfn == CFN_MASK_LOAD
1037 || cfn == CFN_GATHER_LOAD
1038 || cfn == CFN_MASK_GATHER_LOAD)
1039 load_p = true;
1040 else if ((internal_fn_p (cfn)
1041 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1042 || gimple_call_tail_p (call_stmt)
1043 || gimple_call_noreturn_p (call_stmt)
1044 || gimple_call_chain (call_stmt))
1046 if (dump_enabled_p ())
1047 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1048 "Build SLP failed: unsupported call type %G",
1049 (gimple *) call_stmt);
1050 if (is_a <bb_vec_info> (vinfo) && i != 0)
1051 continue;
1052 /* Fatal mismatch. */
1053 matches[0] = false;
1054 return false;
1057 else if (gimple_code (stmt) == GIMPLE_PHI)
1059 rhs_code = ERROR_MARK;
1060 phi_p = true;
1062 else
1064 rhs_code = gimple_assign_rhs_code (stmt);
1065 load_p = gimple_vuse (stmt);
1068 /* Check the operation. */
1069 if (i == 0)
1071 *node_vectype = vectype;
1072 first_stmt_code = rhs_code;
1073 first_stmt_load_p = load_p;
1074 first_stmt_phi_p = phi_p;
1076 /* Shift arguments should be equal in all the packed stmts for a
1077 vector shift with scalar shift operand. */
1078 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1079 || rhs_code == LROTATE_EXPR
1080 || rhs_code == RROTATE_EXPR)
1082 /* First see if we have a vector/vector shift. */
1083 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1085 /* No vector/vector shift, try for a vector/scalar shift. */
1086 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1088 if (dump_enabled_p ())
1089 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1090 "Build SLP failed: "
1091 "op not supported by target.\n");
1092 if (is_a <bb_vec_info> (vinfo) && i != 0)
1093 continue;
1094 /* Fatal mismatch. */
1095 matches[0] = false;
1096 return false;
1098 need_same_oprnds = true;
1099 first_op1 = gimple_assign_rhs2 (stmt);
1102 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1104 need_same_oprnds = true;
1105 first_op1 = gimple_assign_rhs2 (stmt);
1107 else if (!load_p
1108 && rhs_code == BIT_FIELD_REF)
1110 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1111 if (!is_a <bb_vec_info> (vinfo)
1112 || TREE_CODE (vec) != SSA_NAME
1113 /* When the element types are not compatible we pun the
1114 source to the target vectype which requires equal size. */
1115 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1116 || !types_compatible_p (TREE_TYPE (vectype),
1117 TREE_TYPE (TREE_TYPE (vec))))
1118 && !operand_equal_p (TYPE_SIZE (vectype),
1119 TYPE_SIZE (TREE_TYPE (vec)))))
1121 if (dump_enabled_p ())
1122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1123 "Build SLP failed: "
1124 "BIT_FIELD_REF not supported\n");
1125 /* Fatal mismatch. */
1126 matches[0] = false;
1127 return false;
1130 else if (rhs_code == CFN_DIV_POW2)
1132 need_same_oprnds = true;
1133 first_op1 = gimple_call_arg (call_stmt, 1);
1136 else
1138 if (first_stmt_code != rhs_code
1139 && alt_stmt_code == ERROR_MARK)
1140 alt_stmt_code = rhs_code;
1141 if ((first_stmt_code != rhs_code
1142 && (first_stmt_code != IMAGPART_EXPR
1143 || rhs_code != REALPART_EXPR)
1144 && (first_stmt_code != REALPART_EXPR
1145 || rhs_code != IMAGPART_EXPR)
1146 /* Handle mismatches in plus/minus by computing both
1147 and merging the results. */
1148 && !((first_stmt_code == PLUS_EXPR
1149 || first_stmt_code == MINUS_EXPR)
1150 && (alt_stmt_code == PLUS_EXPR
1151 || alt_stmt_code == MINUS_EXPR)
1152 && rhs_code == alt_stmt_code)
1153 && !(first_stmt_code.is_tree_code ()
1154 && rhs_code.is_tree_code ()
1155 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1156 == tcc_comparison)
1157 && (swap_tree_comparison (tree_code (first_stmt_code))
1158 == tree_code (rhs_code)))
1159 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1160 && (first_stmt_code == ARRAY_REF
1161 || first_stmt_code == BIT_FIELD_REF
1162 || first_stmt_code == INDIRECT_REF
1163 || first_stmt_code == COMPONENT_REF
1164 || first_stmt_code == MEM_REF)
1165 && (rhs_code == ARRAY_REF
1166 || rhs_code == BIT_FIELD_REF
1167 || rhs_code == INDIRECT_REF
1168 || rhs_code == COMPONENT_REF
1169 || rhs_code == MEM_REF)))
1170 || first_stmt_load_p != load_p
1171 || first_stmt_phi_p != phi_p)
1173 if (dump_enabled_p ())
1175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1176 "Build SLP failed: different operation "
1177 "in stmt %G", stmt);
1178 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1179 "original stmt %G", first_stmt_info->stmt);
1181 /* Mismatch. */
1182 continue;
1185 if (!load_p
1186 && first_stmt_code == BIT_FIELD_REF
1187 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1188 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1190 if (dump_enabled_p ())
1191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1192 "Build SLP failed: different BIT_FIELD_REF "
1193 "arguments in %G", stmt);
1194 /* Mismatch. */
1195 continue;
1198 if (call_stmt && first_stmt_code != CFN_MASK_LOAD)
1200 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1201 call_stmt))
1203 if (dump_enabled_p ())
1204 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1205 "Build SLP failed: different calls in %G",
1206 stmt);
1207 /* Mismatch. */
1208 continue;
1212 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1213 && (gimple_bb (first_stmt_info->stmt)
1214 != gimple_bb (stmt_info->stmt)))
1216 if (dump_enabled_p ())
1217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1218 "Build SLP failed: different BB for PHI "
1219 "or possibly trapping operation in %G", stmt);
1220 /* Mismatch. */
1221 continue;
1224 if (need_same_oprnds)
1226 tree other_op1 = gimple_arg (stmt, 1);
1227 if (!operand_equal_p (first_op1, other_op1, 0))
1229 if (dump_enabled_p ())
1230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1231 "Build SLP failed: different shift "
1232 "arguments in %G", stmt);
1233 /* Mismatch. */
1234 continue;
1238 if (!types_compatible_p (vectype, *node_vectype))
1240 if (dump_enabled_p ())
1241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1242 "Build SLP failed: different vector type "
1243 "in %G", stmt);
1244 /* Mismatch. */
1245 continue;
1249 /* Grouped store or load. */
1250 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1252 if (REFERENCE_CLASS_P (lhs))
1254 /* Store. */
1257 else
1259 /* Load. */
1260 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1261 if (prev_first_load)
1263 /* Check that there are no loads from different interleaving
1264 chains in the same node. */
1265 if (prev_first_load != first_load)
1267 if (dump_enabled_p ())
1268 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1269 vect_location,
1270 "Build SLP failed: different "
1271 "interleaving chains in one node %G",
1272 stmt);
1273 /* Mismatch. */
1274 continue;
1277 else
1278 prev_first_load = first_load;
1280 } /* Grouped access. */
1281 else
1283 if (load_p
1284 && rhs_code != CFN_GATHER_LOAD
1285 && rhs_code != CFN_MASK_GATHER_LOAD)
1287 /* Non-grouped load. */
1288 if (dump_enabled_p ())
1289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1290 "Build SLP failed: not grouped load %G", stmt);
1292 /* FORNOW: Non-grouped loads are not supported. */
1293 if (is_a <bb_vec_info> (vinfo) && i != 0)
1294 continue;
1295 /* Fatal mismatch. */
1296 matches[0] = false;
1297 return false;
1300 /* Not memory operation. */
1301 if (!phi_p
1302 && rhs_code.is_tree_code ()
1303 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1304 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1305 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1306 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1307 && rhs_code != VIEW_CONVERT_EXPR
1308 && rhs_code != CALL_EXPR
1309 && rhs_code != BIT_FIELD_REF)
1311 if (dump_enabled_p ())
1312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1313 "Build SLP failed: operation unsupported %G",
1314 stmt);
1315 if (is_a <bb_vec_info> (vinfo) && i != 0)
1316 continue;
1317 /* Fatal mismatch. */
1318 matches[0] = false;
1319 return false;
1322 if (rhs_code == COND_EXPR)
1324 tree cond_expr = gimple_assign_rhs1 (stmt);
1325 enum tree_code cond_code = TREE_CODE (cond_expr);
1326 enum tree_code swap_code = ERROR_MARK;
1327 enum tree_code invert_code = ERROR_MARK;
1329 if (i == 0)
1330 first_cond_code = TREE_CODE (cond_expr);
1331 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1333 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1334 swap_code = swap_tree_comparison (cond_code);
1335 invert_code = invert_tree_comparison (cond_code, honor_nans);
1338 if (first_cond_code == cond_code)
1340 /* Isomorphism can be achieved by swapping. */
1341 else if (first_cond_code == swap_code)
1342 swap[i] = 1;
1343 /* Isomorphism can be achieved by inverting. */
1344 else if (first_cond_code == invert_code)
1345 swap[i] = 2;
1346 else
1348 if (dump_enabled_p ())
1349 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1350 "Build SLP failed: different"
1351 " operation %G", stmt);
1352 /* Mismatch. */
1353 continue;
1357 if (rhs_code.is_tree_code ()
1358 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1359 && (swap_tree_comparison ((tree_code)first_stmt_code)
1360 == (tree_code)rhs_code))
1361 swap[i] = 1;
1364 matches[i] = true;
1367 for (i = 0; i < group_size; ++i)
1368 if (!matches[i])
1369 return false;
1371 /* If we allowed a two-operation SLP node verify the target can cope
1372 with the permute we are going to use. */
1373 if (alt_stmt_code != ERROR_MARK
1374 && (!alt_stmt_code.is_tree_code ()
1375 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1376 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1378 *two_operators = true;
1381 if (maybe_soft_fail)
1383 unsigned HOST_WIDE_INT const_nunits;
1384 if (!TYPE_VECTOR_SUBPARTS
1385 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1386 || const_nunits > group_size)
1387 matches[0] = false;
1388 else
1390 /* With constant vector elements simulate a mismatch at the
1391 point we need to split. */
1392 unsigned tail = group_size & (const_nunits - 1);
1393 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1395 return false;
1398 return true;
1401 /* Traits for the hash_map used to record failed SLP builds for a stmt set.
1402 Note we never remove apart from at destruction time so we do not
1403 need a special value for deleted that differs from empty. */
1404 struct bst_traits
1406 typedef vec <stmt_vec_info> value_type;
1407 typedef vec <stmt_vec_info> compare_type;
1408 static inline hashval_t hash (value_type);
1409 static inline bool equal (value_type existing, value_type candidate);
1410 static inline bool is_empty (value_type x) { return !x.exists (); }
1411 static inline bool is_deleted (value_type x) { return !x.exists (); }
1412 static const bool empty_zero_p = true;
1413 static inline void mark_empty (value_type &x) { x.release (); }
1414 static inline void mark_deleted (value_type &x) { x.release (); }
1415 static inline void remove (value_type &x) { x.release (); }
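/* Hash a group of scalar stmts by the UIDs of their gimple statements.  */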
1417 inline hashval_t
1418 bst_traits::hash (value_type x)
1420 inchash::hash h;
1421 for (unsigned i = 0; i < x.length (); ++i)
1422 h.add_int (gimple_uid (x[i]->stmt));
1423 return h.end ();
1425 inline bool
1426 bst_traits::equal (value_type existing, value_type candidate)
1428 if (existing.length () != candidate.length ())
1429 return false;
1430 for (unsigned i = 0; i < existing.length (); ++i)
1431 if (existing[i] != candidate[i])
1432 return false;
1433 return true;
1436 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1437 but then vec::insert does memmove and that's not compatible with
1438 std::pair. */
1439 struct chain_op_t
1441 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1442 : code (code_), dt (dt_), op (op_) {}
1443 tree_code code;
1444 vect_def_type dt;
1445 tree op;
1448 /* Comparator for sorting associatable chains. */
1450 static int
1451 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1453 auto *op1 = (const chain_op_t *) op1_;
1454 auto *op2 = (const chain_op_t *) op2_;
1455 if (op1->dt != op2->dt)
1456 return (int)op1->dt - (int)op2->dt;
1457 return (int)op1->code - (int)op2->code;
1460 /* Linearize the associatable expression chain at START with the
1461 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1462 filling CHAIN with the result and using WORKLIST as intermediate storage.
1463 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1464 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1465 stmts, starting with START. */
1467 static void
1468 vect_slp_linearize_chain (vec_info *vinfo,
1469 vec<std::pair<tree_code, gimple *> > &worklist,
1470 vec<chain_op_t> &chain,
1471 enum tree_code code, gimple *start,
1472 gimple *&code_stmt, gimple *&alt_code_stmt,
1473 vec<gimple *> *chain_stmts)
1475 /* For each lane linearize the addition/subtraction (or other
1476 uniform associatable operation) expression tree. */
1477 worklist.safe_push (std::make_pair (code, start));
1478 while (!worklist.is_empty ())
1480 auto entry = worklist.pop ();
1481 gassign *stmt = as_a <gassign *> (entry.second);
1482 enum tree_code in_code = entry.first;
1483 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1484 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1485 if (!code_stmt
1486 && gimple_assign_rhs_code (stmt) == code)
1487 code_stmt = stmt;
1488 else if (!alt_code_stmt
1489 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1490 alt_code_stmt = stmt;
1491 if (chain_stmts)
1492 chain_stmts->safe_push (stmt);
1493 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1495 tree op = gimple_op (stmt, opnum);
1496 vect_def_type dt;
1497 stmt_vec_info def_stmt_info;
1498 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1499 gcc_assert (res);
1500 if (dt == vect_internal_def
1501 && is_pattern_stmt_p (def_stmt_info))
1502 op = gimple_get_lhs (def_stmt_info->stmt);
1503 gimple *use_stmt;
1504 use_operand_p use_p;
1505 if (dt == vect_internal_def
1506 && single_imm_use (op, &use_p, &use_stmt)
1507 && is_gimple_assign (def_stmt_info->stmt)
1508 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1509 || (code == PLUS_EXPR
1510 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1511 == MINUS_EXPR))))
1513 tree_code op_def_code = this_code;
1514 if (op_def_code == MINUS_EXPR && opnum == 1)
1515 op_def_code = PLUS_EXPR;
1516 if (in_code == MINUS_EXPR)
1517 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1518 worklist.safe_push (std::make_pair (op_def_code,
1519 def_stmt_info->stmt));
1521 else
1523 tree_code op_def_code = this_code;
1524 if (op_def_code == MINUS_EXPR && opnum == 1)
1525 op_def_code = PLUS_EXPR;
1526 if (in_code == MINUS_EXPR)
1527 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1528 chain.safe_push (chain_op_t (op_def_code, dt, op));
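/* Map from a vector of scalar stmts to the SLP node discovered for it
   (or to a node marked as failed), used to share already built subtrees
   and to avoid repeating failed discovery attempts.  */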
1534 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1535 simple_hashmap_traits <bst_traits, slp_tree> >
1536 scalar_stmts_to_slp_tree_map_t;
1538 static slp_tree
1539 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1540 vec<stmt_vec_info> stmts, unsigned int group_size,
1541 poly_uint64 *max_nunits,
1542 bool *matches, unsigned *limit, unsigned *tree_size,
1543 scalar_stmts_to_slp_tree_map_t *bst_map);
1545 static slp_tree
1546 vect_build_slp_tree (vec_info *vinfo,
1547 vec<stmt_vec_info> stmts, unsigned int group_size,
1548 poly_uint64 *max_nunits,
1549 bool *matches, unsigned *limit, unsigned *tree_size,
1550 scalar_stmts_to_slp_tree_map_t *bst_map)
1552 if (slp_tree *leader = bst_map->get (stmts))
1554 if (dump_enabled_p ())
1555 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1556 !(*leader)->failed ? "" : "failed ",
1557 (void *) *leader);
1558 if (!(*leader)->failed)
1560 SLP_TREE_REF_COUNT (*leader)++;
1561 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1562 stmts.release ();
1563 return *leader;
1565 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1566 return NULL;
1569 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1570 so we can pick up backedge destinations during discovery. */
1571 slp_tree res = new _slp_tree;
1572 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1573 SLP_TREE_SCALAR_STMTS (res) = stmts;
1574 bst_map->put (stmts.copy (), res);
1576 if (*limit == 0)
1578 if (dump_enabled_p ())
1579 dump_printf_loc (MSG_NOTE, vect_location,
1580 "SLP discovery limit exceeded\n");
1581 /* Mark the node invalid so we can detect those when still in use
1582 as backedge destinations. */
1583 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1584 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1585 res->failed = XNEWVEC (bool, group_size);
1586 memset (res->failed, 0, sizeof (bool) * group_size);
1587 memset (matches, 0, sizeof (bool) * group_size);
1588 return NULL;
1590 --*limit;
1592 if (dump_enabled_p ())
1593 dump_printf_loc (MSG_NOTE, vect_location,
1594 "starting SLP discovery for node %p\n", (void *) res);
1596 poly_uint64 this_max_nunits = 1;
1597 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1598 &this_max_nunits,
1599 matches, limit, tree_size, bst_map);
1600 if (!res_)
1602 if (dump_enabled_p ())
1603 dump_printf_loc (MSG_NOTE, vect_location,
1604 "SLP discovery for node %p failed\n", (void *) res);
1605 /* Mark the node invalid so we can detect those when still in use
1606 as backedge destinations. */
1607 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1608 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1609 res->failed = XNEWVEC (bool, group_size);
1610 if (flag_checking)
1612 unsigned i;
1613 for (i = 0; i < group_size; ++i)
1614 if (!matches[i])
1615 break;
1616 gcc_assert (i < group_size);
1618 memcpy (res->failed, matches, sizeof (bool) * group_size);
1620 else
1622 if (dump_enabled_p ())
1623 dump_printf_loc (MSG_NOTE, vect_location,
1624 "SLP discovery for node %p succeeded\n",
1625 (void *) res);
1626 gcc_assert (res_ == res);
1627 res->max_nunits = this_max_nunits;
1628 vect_update_max_nunits (max_nunits, this_max_nunits);
1629 /* Keep a reference for the bst_map use. */
1630 SLP_TREE_REF_COUNT (res)++;
1632 return res_;
1635 /* Helper for building an associated SLP node chain. */
1637 static void
1638 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1639 slp_tree op0, slp_tree op1,
1640 stmt_vec_info oper1, stmt_vec_info oper2,
1641 vec<std::pair<unsigned, unsigned> > lperm)
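/* Build two internal nodes computing OPER1 respectively OPER2 on the
   shared operands OP0 and OP1 and blend their lanes into PERM with a
   VEC_PERM_EXPR according to LPERM.  */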
1643 unsigned group_size = SLP_TREE_LANES (op1);
1645 slp_tree child1 = new _slp_tree;
1646 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1647 SLP_TREE_VECTYPE (child1) = vectype;
1648 SLP_TREE_LANES (child1) = group_size;
1649 SLP_TREE_CHILDREN (child1).create (2);
1650 SLP_TREE_CHILDREN (child1).quick_push (op0);
1651 SLP_TREE_CHILDREN (child1).quick_push (op1);
1652 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1654 slp_tree child2 = new _slp_tree;
1655 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1656 SLP_TREE_VECTYPE (child2) = vectype;
1657 SLP_TREE_LANES (child2) = group_size;
1658 SLP_TREE_CHILDREN (child2).create (2);
1659 SLP_TREE_CHILDREN (child2).quick_push (op0);
1660 SLP_TREE_REF_COUNT (op0)++;
1661 SLP_TREE_CHILDREN (child2).quick_push (op1);
1662 SLP_TREE_REF_COUNT (op1)++;
1663 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1665 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1666 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1667 SLP_TREE_VECTYPE (perm) = vectype;
1668 SLP_TREE_LANES (perm) = group_size;
1669 /* ??? We should set this NULL but that's not expected. */
1670 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1671 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1672 SLP_TREE_CHILDREN (perm).quick_push (child1);
1673 SLP_TREE_CHILDREN (perm).quick_push (child2);
1676 /* Recursively build an SLP tree starting from NODE.
1677 Fail (and return NULL) if def-stmts are not
1678 isomorphic, require data permutation or are of unsupported types of
1679 operation. Otherwise, return the built SLP node. MATCHES indicates
1680 which stmts in the group matched the first one when discovery
1681 fails. */
1683 static slp_tree
1684 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1685 vec<stmt_vec_info> stmts, unsigned int group_size,
1686 poly_uint64 *max_nunits,
1687 bool *matches, unsigned *limit, unsigned *tree_size,
1688 scalar_stmts_to_slp_tree_map_t *bst_map)
1690 unsigned nops, i, this_tree_size = 0;
1691 poly_uint64 this_max_nunits = *max_nunits;
1693 matches[0] = false;
1695 stmt_vec_info stmt_info = stmts[0];
1696 if (!is_a<gcall *> (stmt_info->stmt)
1697 && !is_a<gassign *> (stmt_info->stmt)
1698 && !is_a<gphi *> (stmt_info->stmt))
1699 return NULL;
1701 nops = gimple_num_args (stmt_info->stmt);
1702 if (const int *map = vect_get_operand_map (stmt_info->stmt))
1703 nops = map[0];
1705 /* If the SLP node is a PHI (induction or reduction), terminate
1706 the recursion. */
1707 bool *skip_args = XALLOCAVEC (bool, nops);
1708 memset (skip_args, 0, sizeof (bool) * nops);
1709 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1710 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1712 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1713 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1714 group_size);
1715 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1716 max_nunits))
1717 return NULL;
1719 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1720 if (def_type == vect_induction_def)
1722 /* Induction PHIs are not cycles but walk the initial
1723 value. Only for inner loops though; for outer loops
1724 we need to pick up the value from the actual PHIs
1725 to more easily support peeling and epilogue vectorization. */
1726 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1727 if (!nested_in_vect_loop_p (loop, stmt_info))
1728 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1729 else
1730 loop = loop->inner;
1731 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1733 else if (def_type == vect_reduction_def
1734 || def_type == vect_double_reduction_def
1735 || def_type == vect_nested_cycle)
1737 /* Else def types have to match. */
1738 stmt_vec_info other_info;
1739 bool all_same = true;
1740 FOR_EACH_VEC_ELT (stmts, i, other_info)
1742 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1743 return NULL;
1744 if (other_info != stmt_info)
1745 all_same = false;
1747 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1748 /* Reduction initial values are not explicitly represented. */
1749 if (!nested_in_vect_loop_p (loop, stmt_info))
1750 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1751 /* Reduction chain backedge defs are filled manually.
1752 ??? Need a better way to identify a SLP reduction chain PHI.
1753 Or a better overall way to SLP match those. */
1754 if (all_same && def_type == vect_reduction_def)
1755 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1757 else if (def_type != vect_internal_def)
1758 return NULL;
1762 bool two_operators = false;
1763 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1764 tree vectype = NULL_TREE;
1765 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1766 &this_max_nunits, matches, &two_operators,
1767 &vectype))
1768 return NULL;
1770 /* If the SLP node is a load, terminate the recursion unless masked. */
1771 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1772 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1774 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1775 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1776 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1777 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1778 else
1780 *max_nunits = this_max_nunits;
1781 (*tree_size)++;
1782 node = vect_create_new_slp_node (node, stmts, 0);
1783 SLP_TREE_VECTYPE (node) = vectype;
1784 /* And compute the load permutation. Whether it is actually
1785 a permutation depends on the unrolling factor which is
1786 decided later. */
1787 vec<unsigned> load_permutation;
1788 int j;
1789 stmt_vec_info load_info;
1790 load_permutation.create (group_size);
1791 stmt_vec_info first_stmt_info
1792 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1793 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1795 int load_place = vect_get_place_in_interleaving_chain
1796 (load_info, first_stmt_info);
1797 gcc_assert (load_place != -1);
1798 load_permutation.safe_push (load_place);
1800 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1801 return node;
1804 else if (gimple_assign_single_p (stmt_info->stmt)
1805 && !gimple_vuse (stmt_info->stmt)
1806 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1808 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1809 the same SSA name vector of a compatible type to vectype. */
1810 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1811 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1812 stmt_vec_info estmt_info;
1813 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1815 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1816 tree bfref = gimple_assign_rhs1 (estmt);
1817 HOST_WIDE_INT lane;
1818 if (!known_eq (bit_field_size (bfref),
1819 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1820 || !constant_multiple_p (bit_field_offset (bfref),
1821 bit_field_size (bfref), &lane))
1823 lperm.release ();
1824 matches[0] = false;
1825 return NULL;
1827 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1829 slp_tree vnode = vect_create_new_slp_node (vNULL);
1830 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1831 /* ??? We record vectype here but we hide eventually necessary
1832 punning and instead rely on code generation to materialize
1833 VIEW_CONVERT_EXPRs as necessary. We should instead make
1834 this explicit somehow. */
1835 SLP_TREE_VECTYPE (vnode) = vectype;
1836 else
1838 /* For different size but compatible elements we can still
1839 use VEC_PERM_EXPR without punning. */
1840 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
1841 && types_compatible_p (TREE_TYPE (vectype),
1842 TREE_TYPE (TREE_TYPE (vec))));
1843 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1845 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
1846 unsigned HOST_WIDE_INT const_nunits;
1847 if (nunits.is_constant (&const_nunits))
1848 SLP_TREE_LANES (vnode) = const_nunits;
1849 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1850 /* We always build a permutation node, even if it is an identity
1851 permute, to shield the rest of the vectorizer from the odd node
1852 representing an actual vector without any scalar ops.
1853 ??? We could hide it completely by making the permute node
1854 external? */
1855 node = vect_create_new_slp_node (node, stmts, 1);
1856 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1857 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1858 SLP_TREE_VECTYPE (node) = vectype;
1859 SLP_TREE_CHILDREN (node).quick_push (vnode);
1860 return node;
1862 /* When discovery reaches an associatable operation, see whether we can
1863 improve that to match up lanes in a way superior to the operand
1864 swapping code which at most looks at two defs.
1865 ??? For BB vectorization we cannot do the brute-force search
1866 for matching as we can succeed by means of builds from scalars
1867 and have no good way to "cost" one build against another. */
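/* For example (hypothetical lanes), a two-lane group
     x0 = (a + b) - c;   x1 = (d - e) + f;
   linearizes to the chains { +a, +b, -c } and { +d, -e, +f } over
   PLUS_EXPR.  The code below matches the chain elements up column-wise
   across lanes, permuting elements within a lane where a column fails
   to match, and uses a two-operator (mixed plus/minus) node for any
   column whose sign differs between lanes.  */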
1868 else if (is_a <loop_vec_info> (vinfo)
1869 /* ??? We don't handle !vect_internal_def defs below. */
1870 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1871 && is_gimple_assign (stmt_info->stmt)
1872 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1873 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1874 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1875 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1876 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1878 /* See if we have a chain of (mixed) adds or subtracts or other
1879 associatable ops. */
1880 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1881 if (code == MINUS_EXPR)
1882 code = PLUS_EXPR;
1883 stmt_vec_info other_op_stmt_info = NULL;
1884 stmt_vec_info op_stmt_info = NULL;
1885 unsigned chain_len = 0;
1886 auto_vec<chain_op_t> chain;
1887 auto_vec<std::pair<tree_code, gimple *> > worklist;
1888 auto_vec<vec<chain_op_t> > chains (group_size);
1889 auto_vec<slp_tree, 4> children;
1890 bool hard_fail = true;
1891 for (unsigned lane = 0; lane < group_size; ++lane)
1893 /* For each lane linearize the addition/subtraction (or other
1894 uniform associatable operation) expression tree. */
1895 gimple *op_stmt = NULL, *other_op_stmt = NULL;
1896 vect_slp_linearize_chain (vinfo, worklist, chain, code,
1897 stmts[lane]->stmt, op_stmt, other_op_stmt,
1898 NULL);
1899 if (!op_stmt_info && op_stmt)
1900 op_stmt_info = vinfo->lookup_stmt (op_stmt);
1901 if (!other_op_stmt_info && other_op_stmt)
1902 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1903 if (chain.length () == 2)
1905 /* In a chain of just two elements resort to the regular
1906 operand swapping scheme. If we run into a length
1907 mismatch still hard-FAIL. */
1908 if (chain_len == 0)
1909 hard_fail = false;
1910 else
1912 matches[lane] = false;
1913 /* ??? We might want to process the other lanes, but
1914 make sure to not give false matching hints to the
1915 caller for lanes we did not process. */
1916 if (lane != group_size - 1)
1917 matches[0] = false;
1919 break;
1921 else if (chain_len == 0)
1922 chain_len = chain.length ();
1923 else if (chain.length () != chain_len)
1925 /* ??? Here we could slip in magic to compensate with
1926 neutral operands. */
1927 matches[lane] = false;
1928 if (lane != group_size - 1)
1929 matches[0] = false;
1930 break;
1932 chains.quick_push (chain.copy ());
1933 chain.truncate (0);
1935 if (chains.length () == group_size)
1937 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
1938 if (!op_stmt_info)
1940 hard_fail = false;
1941 goto out;
1943 /* Now we have a set of chains with the same length. */
1944 /* 1. pre-sort according to def_type and operation. */
1945 for (unsigned lane = 0; lane < group_size; ++lane)
1946 chains[lane].stablesort (dt_sort_cmp, vinfo);
1947 if (dump_enabled_p ())
1949 dump_printf_loc (MSG_NOTE, vect_location,
1950 "pre-sorted chains of %s\n",
1951 get_tree_code_name (code));
1952 for (unsigned lane = 0; lane < group_size; ++lane)
1954 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
1955 dump_printf (MSG_NOTE, "%s %T ",
1956 get_tree_code_name (chains[lane][opnum].code),
1957 chains[lane][opnum].op);
1958 dump_printf (MSG_NOTE, "\n");
1961 /* 2. try to build children nodes, associating as necessary. */
1962 for (unsigned n = 0; n < chain_len; ++n)
1964 vect_def_type dt = chains[0][n].dt;
1965 unsigned lane;
1966 for (lane = 0; lane < group_size; ++lane)
1967 if (chains[lane][n].dt != dt)
1969 if (dt == vect_constant_def
1970 && chains[lane][n].dt == vect_external_def)
1971 dt = vect_external_def;
1972 else if (dt == vect_external_def
1973 && chains[lane][n].dt == vect_constant_def)
1975 else
1976 break;
1978 if (lane != group_size)
1980 if (dump_enabled_p ())
1981 dump_printf_loc (MSG_NOTE, vect_location,
1982 "giving up on chain due to mismatched "
1983 "def types\n");
1984 matches[lane] = false;
1985 if (lane != group_size - 1)
1986 matches[0] = false;
1987 goto out;
1989 if (dt == vect_constant_def
1990 || dt == vect_external_def)
1992 /* Check whether we can build the invariant. If we can't
1993 we never will be able to. */
1994 tree type = TREE_TYPE (chains[0][n].op);
1995 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
1996 && (TREE_CODE (type) == BOOLEAN_TYPE
1997 || !can_duplicate_and_interleave_p (vinfo, group_size,
1998 type)))
2000 matches[0] = false;
2001 goto out;
2003 vec<tree> ops;
2004 ops.create (group_size);
2005 for (lane = 0; lane < group_size; ++lane)
2006 ops.quick_push (chains[lane][n].op);
2007 slp_tree child = vect_create_new_slp_node (ops);
2008 SLP_TREE_DEF_TYPE (child) = dt;
2009 children.safe_push (child);
2011 else if (dt != vect_internal_def)
2013 /* Not sure, we might need something special.
2014 gcc.dg/vect/pr96854.c,
2015 gfortran.dg/vect/fast-math-pr37021.f90
2016 and gfortran.dg/vect/pr61171.f trigger. */
2017 /* Soft-fail for now. */
2018 hard_fail = false;
2019 goto out;
2021 else
2023 vec<stmt_vec_info> op_stmts;
2024 op_stmts.create (group_size);
2025 slp_tree child = NULL;
2026 /* Brute-force our way. We have to consider a lane
2027 failing after fixing an earlier fail up in the
2028 SLP discovery recursion. So track the current
2029 permute per lane. */
2030 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2031 memset (perms, 0, sizeof (unsigned) * group_size);
2034 op_stmts.truncate (0);
2035 for (lane = 0; lane < group_size; ++lane)
2036 op_stmts.quick_push
2037 (vinfo->lookup_def (chains[lane][n].op));
2038 child = vect_build_slp_tree (vinfo, op_stmts,
2039 group_size, &this_max_nunits,
2040 matches, limit,
2041 &this_tree_size, bst_map);
2042 /* ??? We're likely getting too many fatal mismatches
2043 here so maybe we want to ignore them (but then we
2044 have no idea which lanes fatally mismatched). */
2045 if (child || !matches[0])
2046 break;
2047 /* Swap another lane we have not yet matched up into
2048 lanes that did not match. If we run out of
2049 permute possibilities for a lane terminate the
2050 search. */
2051 bool term = false;
2052 for (lane = 1; lane < group_size; ++lane)
2053 if (!matches[lane])
2055 if (n + perms[lane] + 1 == chain_len)
2057 term = true;
2058 break;
2060 std::swap (chains[lane][n],
2061 chains[lane][n + perms[lane] + 1]);
2062 perms[lane]++;
2064 if (term)
2065 break;
2067 while (1);
2068 if (!child)
2070 if (dump_enabled_p ())
2071 dump_printf_loc (MSG_NOTE, vect_location,
2072 "failed to match up op %d\n", n);
2073 op_stmts.release ();
2074 if (lane != group_size - 1)
2075 matches[0] = false;
2076 else
2077 matches[lane] = false;
2078 goto out;
2080 if (dump_enabled_p ())
2082 dump_printf_loc (MSG_NOTE, vect_location,
2083 "matched up op %d to\n", n);
2084 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2086 children.safe_push (child);
2089 /* 3. build SLP nodes to combine the chain. */
2090 for (unsigned lane = 0; lane < group_size; ++lane)
2091 if (chains[lane][0].code != code)
2093 /* See if there's any alternate all-PLUS entry. */
2094 unsigned n;
2095 for (n = 1; n < chain_len; ++n)
2097 for (lane = 0; lane < group_size; ++lane)
2098 if (chains[lane][n].code != code)
2099 break;
2100 if (lane == group_size)
2101 break;
2103 if (n != chain_len)
2105 /* Swap that in at first position. */
2106 std::swap (children[0], children[n]);
2107 for (lane = 0; lane < group_size; ++lane)
2108 std::swap (chains[lane][0], chains[lane][n]);
2110 else
2112 /* ??? When this triggers and we end up with two
2113 vect_constant/external_def operands up front, things break (ICE)
2114 spectacularly when finding an insertion place for the
2115 all-constant op. We should have a fully
2116 vect_internal_def operand though(?), so we can swap
2117 that into first place and then prepend the all-zero
2118 constant. */
2119 if (dump_enabled_p ())
2120 dump_printf_loc (MSG_NOTE, vect_location,
2121 "inserting constant zero to compensate "
2122 "for (partially) negated first "
2123 "operand\n");
2124 chain_len++;
2125 for (lane = 0; lane < group_size; ++lane)
2126 chains[lane].safe_insert
2127 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2128 vec<tree> zero_ops;
2129 zero_ops.create (group_size);
2130 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2131 for (lane = 1; lane < group_size; ++lane)
2132 zero_ops.quick_push (zero_ops[0]);
2133 slp_tree zero = vect_create_new_slp_node (zero_ops);
2134 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2135 children.safe_insert (0, zero);
2137 break;
2139 for (unsigned i = 1; i < children.length (); ++i)
2141 slp_tree op0 = children[i - 1];
2142 slp_tree op1 = children[i];
2143 bool this_two_op = false;
2144 for (unsigned lane = 0; lane < group_size; ++lane)
2145 if (chains[lane][i].code != chains[0][i].code)
2147 this_two_op = true;
2148 break;
2150 slp_tree child;
2151 if (i == children.length () - 1)
2152 child = vect_create_new_slp_node (node, stmts, 2);
2153 else
2154 child = vect_create_new_slp_node (2, ERROR_MARK);
2155 if (this_two_op)
2157 vec<std::pair<unsigned, unsigned> > lperm;
2158 lperm.create (group_size);
2159 for (unsigned lane = 0; lane < group_size; ++lane)
2160 lperm.quick_push (std::make_pair
2161 (chains[lane][i].code != chains[0][i].code, lane));
2162 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2163 (chains[0][i].code == code
2164 ? op_stmt_info
2165 : other_op_stmt_info),
2166 (chains[0][i].code == code
2167 ? other_op_stmt_info
2168 : op_stmt_info),
2169 lperm);
2171 else
2173 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2174 SLP_TREE_VECTYPE (child) = vectype;
2175 SLP_TREE_LANES (child) = group_size;
2176 SLP_TREE_CHILDREN (child).quick_push (op0);
2177 SLP_TREE_CHILDREN (child).quick_push (op1);
2178 SLP_TREE_REPRESENTATIVE (child)
2179 = (chains[0][i].code == code
2180 ? op_stmt_info : other_op_stmt_info);
2182 children[i] = child;
2184 *tree_size += this_tree_size + 1;
2185 *max_nunits = this_max_nunits;
2186 while (!chains.is_empty ())
2187 chains.pop ().release ();
2188 return node;
2190 out:
2191 while (!children.is_empty ())
2192 vect_free_slp_tree (children.pop ());
2193 while (!chains.is_empty ())
2194 chains.pop ().release ();
2195 /* Hard-fail, otherwise we might run into quadratic processing of the
2196 chains, starting one stmt further into the chain again. */
2197 if (hard_fail)
2198 return NULL;
2199 /* Fall thru to normal processing. */
2202 /* Get at the operands, verifying they are compatible. */
2203 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2204 slp_oprnd_info oprnd_info;
2205 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2207 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2208 stmts, i, &oprnds_info);
2209 if (res != 0)
2210 matches[(res == -1) ? 0 : i] = false;
2211 if (!matches[0])
2212 break;
2214 for (i = 0; i < group_size; ++i)
2215 if (!matches[i])
2217 vect_free_oprnd_info (oprnds_info);
2218 return NULL;
2220 swap = NULL;
2222 auto_vec<slp_tree, 4> children;
2224 stmt_info = stmts[0];
2226 /* Create SLP_TREE nodes for the definition node/s. */
2227 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2229 slp_tree child;
2230 unsigned int j;
2232 /* We're skipping certain operands from processing, for example
2233 outer loop reduction initial defs. */
2234 if (skip_args[i])
2236 children.safe_push (NULL);
2237 continue;
2240 if (oprnd_info->first_dt == vect_uninitialized_def)
2242 /* COND_EXPRs eventually have one operand too many if the condition
2243 is an SSA name. */
2244 gcc_assert (i == 3 && nops == 4);
2245 continue;
2248 if (is_a <bb_vec_info> (vinfo)
2249 && oprnd_info->first_dt == vect_internal_def
2250 && !oprnd_info->any_pattern)
2252 /* For BB vectorization, if all defs are the same do not
2253 bother to continue the build along the single-lane
2254 graph but use a splat of the scalar value. */
2255 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2256 for (j = 1; j < group_size; ++j)
2257 if (oprnd_info->def_stmts[j] != first_def)
2258 break;
2259 if (j == group_size
2260 /* But avoid doing this for loads where we may be
2261 able to CSE things, unless the stmt is not
2262 vectorizable. */
2263 && (!STMT_VINFO_VECTORIZABLE (first_def)
2264 || !gimple_vuse (first_def->stmt)))
2266 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_NOTE, vect_location,
2268 "Using a splat of the uniform operand %G",
2269 first_def->stmt);
2270 oprnd_info->first_dt = vect_external_def;
2274 if (oprnd_info->first_dt == vect_external_def
2275 || oprnd_info->first_dt == vect_constant_def)
2277 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2278 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2279 oprnd_info->ops = vNULL;
2280 children.safe_push (invnode);
2281 continue;
2284 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2285 group_size, &this_max_nunits,
2286 matches, limit,
2287 &this_tree_size, bst_map)) != NULL)
2289 oprnd_info->def_stmts = vNULL;
2290 children.safe_push (child);
2291 continue;
2294 /* If the SLP build for operand zero failed and operands zero
2295 and one can be commuted, try that for the scalar stmts
2296 that failed the match. */
2297 if (i == 0
2298 /* A first scalar stmt mismatch signals a fatal mismatch. */
2299 && matches[0]
2300 /* ??? For COND_EXPRs we can swap the comparison operands
2301 as well as the arms under some constraints. */
2302 && nops == 2
2303 && oprnds_info[1]->first_dt == vect_internal_def
2304 && is_gimple_assign (stmt_info->stmt)
2305 /* Swapping operands for reductions breaks assumptions later on. */
2306 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2307 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2309 /* See whether we can swap the matching or the non-matching
2310 stmt operands. */
2311 bool swap_not_matching = true;
2314 for (j = 0; j < group_size; ++j)
2316 if (matches[j] != !swap_not_matching)
2317 continue;
2318 stmt_vec_info stmt_info = stmts[j];
2319 /* Verify if we can swap operands of this stmt. */
2320 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2321 if (!stmt
2322 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2324 if (!swap_not_matching)
2325 goto fail;
2326 swap_not_matching = false;
2327 break;
2331 while (j != group_size);
2333 /* Swap mismatched definition stmts. */
2334 if (dump_enabled_p ())
2335 dump_printf_loc (MSG_NOTE, vect_location,
2336 "Re-trying with swapped operands of stmts ");
2337 for (j = 0; j < group_size; ++j)
2338 if (matches[j] == !swap_not_matching)
2340 std::swap (oprnds_info[0]->def_stmts[j],
2341 oprnds_info[1]->def_stmts[j]);
2342 std::swap (oprnds_info[0]->ops[j],
2343 oprnds_info[1]->ops[j]);
2344 if (dump_enabled_p ())
2345 dump_printf (MSG_NOTE, "%d ", j);
2347 if (dump_enabled_p ())
2348 dump_printf (MSG_NOTE, "\n");
2349 /* After swapping some operands we lost track of whether an
2350 operand has any pattern defs, so be conservative here. */
2351 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2352 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2353 /* And try again with scratch 'matches' ... */
2354 bool *tem = XALLOCAVEC (bool, group_size);
2355 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2356 group_size, &this_max_nunits,
2357 tem, limit,
2358 &this_tree_size, bst_map)) != NULL)
2360 oprnd_info->def_stmts = vNULL;
2361 children.safe_push (child);
2362 continue;
2365 fail:
2367 /* If the SLP build failed and we analyze a basic-block
2368 simply treat nodes we fail to build as externally defined
2369 (and thus build vectors from the scalar defs).
2370 The cost model will reject outright expensive cases.
2371 ??? This doesn't treat cases where permutation ultimately
2372 fails (or we don't try permutation below). Ideally we'd
2373 even compute a permutation that will end up with the maximum
2374 SLP tree size... */
2375 if (is_a <bb_vec_info> (vinfo)
2376 /* ??? Rejecting patterns this way doesn't work. We'd have to
2377 do extra work to cancel the pattern so the uses see the
2378 scalar version. */
2379 && !is_pattern_stmt_p (stmt_info)
2380 && !oprnd_info->any_pattern)
2382 /* But if there's a leading vector-sized set of matching stmts,
2383 fail here so we can split the group. This matches the condition
2384 vect_analyze_slp_instance uses. */
2385 /* ??? We might want to split here and combine the results to support
2386 multiple vector sizes better. */
2387 for (j = 0; j < group_size; ++j)
2388 if (!matches[j])
2389 break;
2390 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2392 if (dump_enabled_p ())
2393 dump_printf_loc (MSG_NOTE, vect_location,
2394 "Building vector operands from scalars\n");
2395 this_tree_size++;
2396 child = vect_create_new_slp_node (oprnd_info->ops);
2397 children.safe_push (child);
2398 oprnd_info->ops = vNULL;
2399 continue;
2403 gcc_assert (child == NULL);
2404 FOR_EACH_VEC_ELT (children, j, child)
2405 if (child)
2406 vect_free_slp_tree (child);
2407 vect_free_oprnd_info (oprnds_info);
2408 return NULL;
2411 vect_free_oprnd_info (oprnds_info);
2413 /* If all children of the node are built up from uniform scalars, or
2414 if it requires more than one possibly expensive vector construction,
2415 then just throw the node away, causing it to be built up from scalars.
2416 The exception is the SLP node for the vector store. */
2417 if (is_a <bb_vec_info> (vinfo)
2418 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2419 /* ??? Rejecting patterns this way doesn't work. We'd have to
2420 do extra work to cancel the pattern so the uses see the
2421 scalar version. */
2422 && !is_pattern_stmt_p (stmt_info))
2424 slp_tree child;
2425 unsigned j;
2426 bool all_uniform_p = true;
2427 unsigned n_vector_builds = 0;
2428 FOR_EACH_VEC_ELT (children, j, child)
2430 if (!child)
2432 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2433 all_uniform_p = false;
2434 else if (!vect_slp_tree_uniform_p (child))
2436 all_uniform_p = false;
2437 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2438 n_vector_builds++;
2441 if (all_uniform_p
2442 || n_vector_builds > 1
2443 || (n_vector_builds == children.length ()
2444 && is_a <gphi *> (stmt_info->stmt)))
2446 /* Roll back. */
2447 matches[0] = false;
2448 FOR_EACH_VEC_ELT (children, j, child)
2449 if (child)
2450 vect_free_slp_tree (child);
2452 if (dump_enabled_p ())
2453 dump_printf_loc (MSG_NOTE, vect_location,
2454 "Building parent vector operands from "
2455 "scalars instead\n");
2456 return NULL;
2460 *tree_size += this_tree_size + 1;
2461 *max_nunits = this_max_nunits;
2463 if (two_operators)
2465 /* ??? We'd likely want to either cache in bst_map something like
2466 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2467 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2468 explicit stmts to put in so the keying on 'stmts' doesn't
2469 work (but we have the same issue with nodes that use 'ops'). */
2470 slp_tree one = new _slp_tree;
2471 slp_tree two = new _slp_tree;
2472 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2473 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2474 SLP_TREE_VECTYPE (one) = vectype;
2475 SLP_TREE_VECTYPE (two) = vectype;
2476 SLP_TREE_CHILDREN (one).safe_splice (children);
2477 SLP_TREE_CHILDREN (two).safe_splice (children);
2478 slp_tree child;
2479 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2480 SLP_TREE_REF_COUNT (child)++;
2482 /* Here we record the original defs since this
2483 node represents the final lane configuration. */
2484 node = vect_create_new_slp_node (node, stmts, 2);
2485 SLP_TREE_VECTYPE (node) = vectype;
2486 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2487 SLP_TREE_CHILDREN (node).quick_push (one);
2488 SLP_TREE_CHILDREN (node).quick_push (two);
2489 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2490 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2491 enum tree_code ocode = ERROR_MARK;
2492 stmt_vec_info ostmt_info;
2493 unsigned j = 0;
2494 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2496 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2497 if (gimple_assign_rhs_code (ostmt) != code0)
2499 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2500 ocode = gimple_assign_rhs_code (ostmt);
2501 j = i;
2503 else
2504 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2506 SLP_TREE_CODE (one) = code0;
2507 SLP_TREE_CODE (two) = ocode;
2508 SLP_TREE_LANES (one) = stmts.length ();
2509 SLP_TREE_LANES (two) = stmts.length ();
2510 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2511 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2512 return node;
2515 node = vect_create_new_slp_node (node, stmts, nops);
2516 SLP_TREE_VECTYPE (node) = vectype;
2517 SLP_TREE_CHILDREN (node).splice (children);
2518 return node;
2521 /* Dump a single SLP tree NODE. */
2523 static void
2524 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2525 slp_tree node)
2527 unsigned i, j;
2528 slp_tree child;
2529 stmt_vec_info stmt_info;
2530 tree op;
2532 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2533 dump_user_location_t user_loc = loc.get_user_location ();
2534 dump_printf_loc (metadata, user_loc,
2535 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2536 ", refcnt=%u)",
2537 SLP_TREE_DEF_TYPE (node) == vect_external_def
2538 ? " (external)"
2539 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2540 ? " (constant)"
2541 : ""), (void *) node,
2542 estimated_poly_value (node->max_nunits),
2543 SLP_TREE_REF_COUNT (node));
2544 if (SLP_TREE_VECTYPE (node))
2545 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2546 dump_printf (metadata, "\n");
2547 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2549 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2550 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2551 else
2552 dump_printf_loc (metadata, user_loc, "op template: %G",
2553 SLP_TREE_REPRESENTATIVE (node)->stmt);
2555 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2556 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2557 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2558 else
2560 dump_printf_loc (metadata, user_loc, "\t{ ");
2561 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2562 dump_printf (metadata, "%T%s ", op,
2563 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2564 dump_printf (metadata, "}\n");
2566 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2568 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2569 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2570 dump_printf (dump_kind, " %u", j);
2571 dump_printf (dump_kind, " }\n");
2573 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2575 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2576 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2577 dump_printf (dump_kind, " %u[%u]",
2578 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2579 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2580 dump_printf (dump_kind, " }\n");
2582 if (SLP_TREE_CHILDREN (node).is_empty ())
2583 return;
2584 dump_printf_loc (metadata, user_loc, "\tchildren");
2585 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2586 dump_printf (dump_kind, " %p", (void *)child);
2587 dump_printf (dump_kind, "\n");
2590 DEBUG_FUNCTION void
2591 debug (slp_tree node)
2593 debug_dump_context ctx;
2594 vect_print_slp_tree (MSG_NOTE,
2595 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2596 node);
2599 /* Recursive helper for the dot producer below. */
2601 static void
2602 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2604 if (visited.add (node))
2605 return;
2607 fprintf (f, "\"%p\" [label=\"", (void *)node);
2608 vect_print_slp_tree (MSG_NOTE,
2609 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2610 node);
2611 fprintf (f, "\"];\n");
2614 for (slp_tree child : SLP_TREE_CHILDREN (node))
2615 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2617 for (slp_tree child : SLP_TREE_CHILDREN (node))
2618 if (child)
2619 dot_slp_tree (f, child, visited);
2622 DEBUG_FUNCTION void
2623 dot_slp_tree (const char *fname, slp_tree node)
2625 FILE *f = fopen (fname, "w");
2626 fprintf (f, "digraph {\n");
2627 fflush (f);
2629 debug_dump_context ctx (f);
2630 hash_set<slp_tree> visited;
2631 dot_slp_tree (f, node, visited);
2633 fflush (f);
2634 fprintf (f, "}\n");
2635 fclose (f);
2638 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2640 static void
2641 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2642 slp_tree node, hash_set<slp_tree> &visited)
2644 unsigned i;
2645 slp_tree child;
2647 if (visited.add (node))
2648 return;
2650 vect_print_slp_tree (dump_kind, loc, node);
2652 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2653 if (child)
2654 vect_print_slp_graph (dump_kind, loc, child, visited);
2657 static void
2658 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2659 slp_tree entry)
2661 hash_set<slp_tree> visited;
2662 vect_print_slp_graph (dump_kind, loc, entry, visited);
2665 /* Mark the tree rooted at NODE with PURE_SLP. */
2667 static void
2668 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2670 int i;
2671 stmt_vec_info stmt_info;
2672 slp_tree child;
2674 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2675 return;
2677 if (visited.add (node))
2678 return;
2680 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2681 STMT_SLP_TYPE (stmt_info) = pure_slp;
2683 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2684 if (child)
2685 vect_mark_slp_stmts (child, visited);
2688 static void
2689 vect_mark_slp_stmts (slp_tree node)
2691 hash_set<slp_tree> visited;
2692 vect_mark_slp_stmts (node, visited);
2695 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2697 static void
2698 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2700 int i;
2701 stmt_vec_info stmt_info;
2702 slp_tree child;
2704 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2705 return;
2707 if (visited.add (node))
2708 return;
2710 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2712 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2713 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2714 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2717 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2718 if (child)
2719 vect_mark_slp_stmts_relevant (child, visited);
2722 static void
2723 vect_mark_slp_stmts_relevant (slp_tree node)
2725 hash_set<slp_tree> visited;
2726 vect_mark_slp_stmts_relevant (node, visited);
2730 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2732 static void
2733 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2734 hash_set<slp_tree> &visited)
2736 if (!node || visited.add (node))
2737 return;
2739 if (SLP_TREE_CHILDREN (node).length () == 0)
2741 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2742 return;
2743 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2744 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2745 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2746 loads.safe_push (node);
2748 else
2750 unsigned i;
2751 slp_tree child;
2752 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2753 vect_gather_slp_loads (loads, child, visited);
2758 /* Find the last scalar stmt in NODE. */
2760 stmt_vec_info
2761 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2763 stmt_vec_info last = NULL;
2764 stmt_vec_info stmt_vinfo;
2766 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2768 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2769 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2772 return last;
2775 /* Find the first stmt in NODE. */
2777 stmt_vec_info
2778 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2780 stmt_vec_info first = NULL;
2781 stmt_vec_info stmt_vinfo;
2783 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2785 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2786 if (!first
2787 || get_later_stmt (stmt_vinfo, first) == first)
2788 first = stmt_vinfo;
2791 return first;
2794 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2795 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2796 (also containing the first GROUP1_SIZE stmts, since stores are
2797 consecutive), the second containing the remainder.
2798 Return the first stmt in the second group. */
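/* For example (hypothetical group), splitting a store group of eight
   stmts s0..s7 with GROUP1_SIZE == 4 yields groups { s0..s3 } and
   { s4..s7 }.  The second group's DR_GROUP_GAP becomes the original
   gap plus 4 (to skip over the first group) and the first group's gap
   is increased by 4 (to skip over the second), so each split group
   still strides over the whole original group per scalar iteration.  */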
2800 static stmt_vec_info
2801 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2803 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2804 gcc_assert (group1_size > 0);
2805 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2806 gcc_assert (group2_size > 0);
2807 DR_GROUP_SIZE (first_vinfo) = group1_size;
2809 stmt_vec_info stmt_info = first_vinfo;
2810 for (unsigned i = group1_size; i > 1; i--)
2812 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2813 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2815 /* STMT is now the last element of the first group. */
2816 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2817 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2819 DR_GROUP_SIZE (group2) = group2_size;
2820 for (stmt_info = group2; stmt_info;
2821 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2823 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2824 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2827 /* For the second group, the DR_GROUP_GAP is that before the original group,
2828 plus skipping over the first group. */
2829 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2831 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2832 DR_GROUP_GAP (first_vinfo) += group2_size;
2834 if (dump_enabled_p ())
2835 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2836 group1_size, group2_size);
2838 return group2;
2841 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2842 statements and a vector of NUNITS elements. */
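/* For example, with NUNITS == 4 and GROUP_SIZE == 6 the common multiple
   is 12 and the unrolling factor is 2; with NUNITS == 4 and
   GROUP_SIZE == 8 the factor is 1, i.e. no unrolling is needed.  */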
2844 static poly_uint64
2845 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2847 return exact_div (common_multiple (nunits, group_size), group_size);
2850 /* Helper that checks to see if a node is a load node. */
2852 static inline bool
2853 vect_is_slp_load_node (slp_tree root)
2855 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2856 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2857 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2861 /* Helper function of optimize_load_redistribution that performs the operation
2862 recursively. */
2864 static slp_tree
2865 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2866 vec_info *vinfo, unsigned int group_size,
2867 hash_map<slp_tree, slp_tree> *load_map,
2868 slp_tree root)
2870 if (slp_tree *leader = load_map->get (root))
2871 return *leader;
2873 slp_tree node;
2874 unsigned i;
2876 /* For now, we don't know anything about externals so do not do anything. */
2877 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2878 return NULL;
2879 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2881 /* First convert this node into a load node and add it to the leaves
2882 list, flattening the permute from a lane to a load one. If it's
2883 unneeded it will be elided later. */
2884 vec<stmt_vec_info> stmts;
2885 stmts.create (SLP_TREE_LANES (root));
2886 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2887 for (unsigned j = 0; j < lane_perm.length (); j++)
2889 std::pair<unsigned, unsigned> perm = lane_perm[j];
2890 node = SLP_TREE_CHILDREN (root)[perm.first];
2892 if (!vect_is_slp_load_node (node)
2893 || SLP_TREE_CHILDREN (node).exists ())
2895 stmts.release ();
2896 goto next;
2899 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2902 if (dump_enabled_p ())
2903 dump_printf_loc (MSG_NOTE, vect_location,
2904 "converting stmts on permute node %p\n",
2905 (void *) root);
2907 bool *matches = XALLOCAVEC (bool, group_size);
2908 poly_uint64 max_nunits = 1;
2909 unsigned tree_size = 0, limit = 1;
2910 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2911 matches, &limit, &tree_size, bst_map);
2912 if (!node)
2913 stmts.release ();
2915 load_map->put (root, node);
2916 return node;
2919 next:
2920 load_map->put (root, NULL);
2922 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2924 slp_tree value
2925 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2926 node);
2927 if (value)
2929 SLP_TREE_REF_COUNT (value)++;
2930 SLP_TREE_CHILDREN (root)[i] = value;
2931 /* ??? We know the original leaves of the replaced nodes will
2932 be referenced by bst_map, only the permutes created by
2933 pattern matching are not. */
2934 if (SLP_TREE_REF_COUNT (node) == 1)
2935 load_map->remove (node);
2936 vect_free_slp_tree (node);
2940 return NULL;
2943 /* Temporary workaround for loads not being CSEd during SLP build. This
2944 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
2945 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2946 same DR such that the final operation is equal to a permuted load. Such
2947 NODES are then directly converted into LOADS themselves. The nodes are
2948 CSEd using BST_MAP. */
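/* A minimal illustration (hypothetical nodes): a VEC_PERM node with lane
   permutation { (0,1), (1,0) } over two load nodes { a[0], a[1] } and
   { a[2], a[3] } from the same DR gathers { a[1], a[2] }, so it can be
   rebuilt as a single load node with load permutation { 1, 2 }.  */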
2950 static void
2951 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2952 vec_info *vinfo, unsigned int group_size,
2953 hash_map<slp_tree, slp_tree> *load_map,
2954 slp_tree root)
2956 slp_tree node;
2957 unsigned i;
2959 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2961 slp_tree value
2962 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2963 node);
2964 if (value)
2966 SLP_TREE_REF_COUNT (value)++;
2967 SLP_TREE_CHILDREN (root)[i] = value;
2968 /* ??? We know the original leaves of the replaced nodes will
2969 be referenced by bst_map, only the permutes created by
2970 pattern matching are not. */
2971 if (SLP_TREE_REF_COUNT (node) == 1)
2972 load_map->remove (node);
2973 vect_free_slp_tree (node);
2978 /* Helper function of vect_match_slp_patterns.
2980 Attempts to match patterns against the slp tree rooted in REF_NODE using
2981 VINFO. Patterns are matched in post-order traversal.
2983 If matching is successful the value in REF_NODE is updated in place and
2984 true is returned, otherwise REF_NODE is left unchanged and false is returned. */
2986 static bool
2987 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2988 slp_tree_to_load_perm_map_t *perm_cache,
2989 slp_compat_nodes_map_t *compat_cache,
2990 hash_set<slp_tree> *visited)
2992 unsigned i;
2993 slp_tree node = *ref_node;
2994 bool found_p = false;
2995 if (!node || visited->add (node))
2996 return false;
2998 slp_tree child;
2999 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3000 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3001 vinfo, perm_cache, compat_cache,
3002 visited);
3004 for (unsigned x = 0; x < num__slp_patterns; x++)
3006 vect_pattern *pattern
3007 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3008 if (pattern)
3010 pattern->build (vinfo);
3011 delete pattern;
3012 found_p = true;
3016 return found_p;
3019 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3020 vec_info VINFO.
3022 The tree is modified in place; true is returned if any pattern matched.
3023 Patterns are tried in order and multiple patterns may match. */
3025 static bool
3026 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3027 hash_set<slp_tree> *visited,
3028 slp_tree_to_load_perm_map_t *perm_cache,
3029 slp_compat_nodes_map_t *compat_cache)
3031 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3032 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3034 if (dump_enabled_p ())
3035 dump_printf_loc (MSG_NOTE, vect_location,
3036 "Analyzing SLP tree %p for patterns\n",
3037 (void *) SLP_INSTANCE_TREE (instance));
3039 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3040 visited);
3043 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3044 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3045 Return true if we could use IFN_STORE_LANES instead and if that appears
3046 to be the better approach. */
3048 static bool
3049 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3050 unsigned int group_size,
3051 unsigned int new_group_size)
3053 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3054 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3055 if (!vectype)
3056 return false;
3057 /* Allow the split if one of the two new groups would operate on full
3058 vectors *within* rather than across one scalar loop iteration.
3059 This is purely a heuristic, but it should work well for group
3060 sizes of 3 and 4, where the possible splits are:
3062 3->2+1: OK if the vector has exactly two elements
3063 4->2+2: Likewise
3064 4->3+1: Less clear-cut. */
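/* For example, with a two-element vector a 4->2+2 split leaves both new
   groups operating on full vectors, so the split is allowed (we return
   false); for a 4->3+1 split neither 3 nor 1 is a multiple of 2, so we
   instead ask whether IFN_STORE_LANES is supported for the whole group
   of 4.  */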
3065 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3066 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3067 return false;
3068 return vect_store_lanes_supported (vectype, group_size, false);
3071 /* Analyze an SLP instance starting from a group of grouped stores. Call
3072 vect_build_slp_tree to build a tree of packed stmts if possible.
3073 Return FALSE if it's impossible to SLP any stmt in the loop. */
3075 static bool
3076 vect_analyze_slp_instance (vec_info *vinfo,
3077 scalar_stmts_to_slp_tree_map_t *bst_map,
3078 stmt_vec_info stmt_info, slp_instance_kind kind,
3079 unsigned max_tree_size, unsigned *limit);
3081 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3082 of KIND. Return true if successful. */
3084 static bool
3085 vect_build_slp_instance (vec_info *vinfo,
3086 slp_instance_kind kind,
3087 vec<stmt_vec_info> &scalar_stmts,
3088 vec<stmt_vec_info> &root_stmt_infos,
3089 unsigned max_tree_size, unsigned *limit,
3090 scalar_stmts_to_slp_tree_map_t *bst_map,
3091 /* ??? We need stmt_info for group splitting. */
3092 stmt_vec_info stmt_info_)
3094 if (dump_enabled_p ())
3096 dump_printf_loc (MSG_NOTE, vect_location,
3097 "Starting SLP discovery for\n");
3098 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3099 dump_printf_loc (MSG_NOTE, vect_location,
3100 " %G", scalar_stmts[i]->stmt);
3103 /* Build the tree for the SLP instance. */
3104 unsigned int group_size = scalar_stmts.length ();
3105 bool *matches = XALLOCAVEC (bool, group_size);
3106 poly_uint64 max_nunits = 1;
3107 unsigned tree_size = 0;
3108 unsigned i;
3109 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3110 &max_nunits, matches, limit,
3111 &tree_size, bst_map);
3112 if (node != NULL)
3114 /* Calculate the unrolling factor based on the smallest type. */
3115 poly_uint64 unrolling_factor
3116 = calculate_unrolling_factor (max_nunits, group_size);
3118 if (maybe_ne (unrolling_factor, 1U)
3119 && is_a <bb_vec_info> (vinfo))
3121 unsigned HOST_WIDE_INT const_max_nunits;
3122 if (!max_nunits.is_constant (&const_max_nunits)
3123 || const_max_nunits > group_size)
3125 if (dump_enabled_p ())
3126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3127 "Build SLP failed: store group "
3128 "size not a multiple of the vector size "
3129 "in basic block SLP\n");
3130 vect_free_slp_tree (node);
3131 return false;
3133 /* Fatal mismatch. */
3134 if (dump_enabled_p ())
3135 dump_printf_loc (MSG_NOTE, vect_location,
3136 "SLP discovery succeeded but node needs "
3137 "splitting\n");
3138 memset (matches, true, group_size);
3139 matches[group_size / const_max_nunits * const_max_nunits] = false;
3140 vect_free_slp_tree (node);
3142 else
3144 /* Create a new SLP instance. */
3145 slp_instance new_instance = XNEW (class _slp_instance);
3146 SLP_INSTANCE_TREE (new_instance) = node;
3147 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3148 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3149 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3150 SLP_INSTANCE_KIND (new_instance) = kind;
3151 new_instance->reduc_phis = NULL;
3152 new_instance->cost_vec = vNULL;
3153 new_instance->subgraph_entries = vNULL;
3155 if (dump_enabled_p ())
3156 dump_printf_loc (MSG_NOTE, vect_location,
3157 "SLP size %u vs. limit %u.\n",
3158 tree_size, max_tree_size);
3160 /* Fixup SLP reduction chains. */
3161 if (kind == slp_inst_kind_reduc_chain)
3163 /* If this is a reduction chain with a conversion in front
3164 amend the SLP tree with a node for that. */
3165 gimple *scalar_def
3166 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3167 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3169 /* Get at the conversion stmt - we know it's the single use
3170 of the last stmt of the reduction chain. */
3171 use_operand_p use_p;
3172 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3173 &use_p, &scalar_def);
3174 gcc_assert (r);
3175 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3176 next_info = vect_stmt_to_vectorize (next_info);
3177 scalar_stmts = vNULL;
3178 scalar_stmts.create (group_size);
3179 for (unsigned i = 0; i < group_size; ++i)
3180 scalar_stmts.quick_push (next_info);
3181 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3182 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3183 SLP_TREE_CHILDREN (conv).quick_push (node);
3184 SLP_INSTANCE_TREE (new_instance) = conv;
3185 /* We also have to fake this conversion stmt as SLP reduction
3186 group so we don't have to mess with too much code
3187 elsewhere. */
3188 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3189 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3191 /* Fill the backedge child of the PHI SLP node. The
3192 general matching code cannot find it because the
3193 scalar code does not reflect how we vectorize the
3194 reduction. */
3195 use_operand_p use_p;
3196 imm_use_iterator imm_iter;
3197 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3198 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3199 gimple_get_lhs (scalar_def))
3200 /* There are exactly two non-debug uses, the reduction
3201 PHI and the loop-closed PHI node. */
3202 if (!is_gimple_debug (USE_STMT (use_p))
3203 && gimple_bb (USE_STMT (use_p)) == loop->header)
3205 auto_vec<stmt_vec_info, 64> phis (group_size);
3206 stmt_vec_info phi_info
3207 = vinfo->lookup_stmt (USE_STMT (use_p));
3208 for (unsigned i = 0; i < group_size; ++i)
3209 phis.quick_push (phi_info);
3210 slp_tree *phi_node = bst_map->get (phis);
3211 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3212 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3213 = SLP_INSTANCE_TREE (new_instance);
3214 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3218 vinfo->slp_instances.safe_push (new_instance);
3220 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3221 the number of scalar stmts in the root in a few places.
3222 Verify that assumption holds. */
3223 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3224 .length () == group_size);
3226 if (dump_enabled_p ())
3228 dump_printf_loc (MSG_NOTE, vect_location,
3229 "Final SLP tree for instance %p:\n",
3230 (void *) new_instance);
3231 vect_print_slp_graph (MSG_NOTE, vect_location,
3232 SLP_INSTANCE_TREE (new_instance));
3235 return true;
3238 else
3240 /* Failed to SLP. */
3241 /* Free the allocated memory. */
3242 scalar_stmts.release ();
3245 stmt_vec_info stmt_info = stmt_info_;
3246 /* Try to break the group up into pieces. */
3247 if (kind == slp_inst_kind_store)
3249 /* ??? We could delay all the actual splitting of store-groups
3250 until after SLP discovery of the original group completed.
3251 Then we can recurse to vect_build_slp_instance directly. */
3252 for (i = 0; i < group_size; i++)
3253 if (!matches[i])
3254 break;
3256 /* For basic block SLP, try to break the group up into multiples of
3257 a vector size. */
3258 if (is_a <bb_vec_info> (vinfo)
3259 && (i > 1 && i < group_size))
3261 tree scalar_type
3262 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3263 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3264 1 << floor_log2 (i));
3265 unsigned HOST_WIDE_INT const_nunits;
3266 if (vectype
3267 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3269 /* Split into two groups at the first vector boundary. */
3270 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3271 unsigned group1_size = i & ~(const_nunits - 1);
3273 if (dump_enabled_p ())
3274 dump_printf_loc (MSG_NOTE, vect_location,
3275 "Splitting SLP group at stmt %u\n", i);
3276 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3277 group1_size);
3278 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3279 kind, max_tree_size,
3280 limit);
3281 /* Split the rest at the failure point and possibly
3282 re-analyze the remaining matching part if it has
3283 at least two lanes. */
3284 if (group1_size < i
3285 && (i + 1 < group_size
3286 || i - group1_size > 1))
3288 stmt_vec_info rest2 = rest;
3289 rest = vect_split_slp_store_group (rest, i - group1_size);
3290 if (i - group1_size > 1)
3291 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3292 kind, max_tree_size,
3293 limit);
3295 /* Re-analyze the non-matching tail if it has at least
3296 two lanes. */
3297 if (i + 1 < group_size)
3298 res |= vect_analyze_slp_instance (vinfo, bst_map,
3299 rest, kind, max_tree_size,
3300 limit);
3301 return res;
3305 /* For loop vectorization split into arbitrary pieces of size > 1. */
3306 if (is_a <loop_vec_info> (vinfo)
3307 && (i > 1 && i < group_size)
3308 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3310 unsigned group1_size = i;
3312 if (dump_enabled_p ())
3313 dump_printf_loc (MSG_NOTE, vect_location,
3314 "Splitting SLP group at stmt %u\n", i);
3316 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3317 group1_size);
3318 /* Loop vectorization cannot handle gaps in stores; make sure
3319 the split group appears as strided. */
3320 STMT_VINFO_STRIDED_P (rest) = 1;
3321 DR_GROUP_GAP (rest) = 0;
3322 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3323 DR_GROUP_GAP (stmt_info) = 0;
3325 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3326 kind, max_tree_size, limit);
3327 if (i + 1 < group_size)
3328 res |= vect_analyze_slp_instance (vinfo, bst_map,
3329 rest, kind, max_tree_size, limit);
3331 return res;
3334 /* Even though the first vector did not all match, we might be able to SLP
3335 (some) of the remainder. FORNOW ignore this possibility. */
3338 /* Failed to SLP. */
3339 if (dump_enabled_p ())
3340 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3341 return false;
3345 /* Analyze an SLP instance starting from a group of grouped stores. Call
3346 vect_build_slp_tree to build a tree of packed stmts if possible.
3347 Return FALSE if it's impossible to SLP any stmt in the loop. */
3349 static bool
3350 vect_analyze_slp_instance (vec_info *vinfo,
3351 scalar_stmts_to_slp_tree_map_t *bst_map,
3352 stmt_vec_info stmt_info,
3353 slp_instance_kind kind,
3354 unsigned max_tree_size, unsigned *limit)
3356 unsigned int i;
3357 vec<stmt_vec_info> scalar_stmts;
3359 if (is_a <bb_vec_info> (vinfo))
3360 vect_location = stmt_info->stmt;
3362 stmt_vec_info next_info = stmt_info;
3363 if (kind == slp_inst_kind_store)
3365 /* Collect the stores and store them in scalar_stmts. */
3366 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3367 while (next_info)
3369 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3370 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3373 else if (kind == slp_inst_kind_reduc_chain)
3375 /* Collect the reduction stmts and store them in scalar_stmts. */
3376 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3377 while (next_info)
3379 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3380 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3382 /* Mark the first element of the reduction chain as reduction to properly
3383 transform the node. In the reduction analysis phase only the last
3384 element of the chain is marked as reduction. */
3385 STMT_VINFO_DEF_TYPE (stmt_info)
3386 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3387 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3388 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3390 else if (kind == slp_inst_kind_ctor)
3392 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
3393 tree val;
3394 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
3395 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
3397 stmt_vec_info def_info = vinfo->lookup_def (val);
3398 def_info = vect_stmt_to_vectorize (def_info);
3399 scalar_stmts.quick_push (def_info);
3401 if (dump_enabled_p ())
3402 dump_printf_loc (MSG_NOTE, vect_location,
3403 "Analyzing vectorizable constructor: %G\n",
3404 stmt_info->stmt);
3406 else if (kind == slp_inst_kind_reduc_group)
3408 /* Collect reduction statements. */
3409 const vec<stmt_vec_info> &reductions
3410 = as_a <loop_vec_info> (vinfo)->reductions;
3411 scalar_stmts.create (reductions.length ());
3412 for (i = 0; reductions.iterate (i, &next_info); i++)
3413 if ((STMT_VINFO_RELEVANT_P (next_info)
3414 || STMT_VINFO_LIVE_P (next_info))
3415 /* ??? Make sure we didn't skip a conversion around a reduction
3416 path. In that case we'd have to reverse engineer that conversion
3417 stmt following the chain using reduc_idx and from the PHI
3418 using reduc_def. */
3419 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3420 scalar_stmts.quick_push (next_info);
3421 /* If fewer than two were relevant/live, there's nothing to SLP. */
3422 if (scalar_stmts.length () < 2)
3423 return false;
3425 else
3426 gcc_unreachable ();
3428 vec<stmt_vec_info> roots = vNULL;
3429 if (kind == slp_inst_kind_ctor)
3431 roots.create (1);
3432 roots.quick_push (stmt_info);
3434 /* Build the tree for the SLP instance. */
3435 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3436 roots,
3437 max_tree_size, limit, bst_map,
3438 kind == slp_inst_kind_store
3439 ? stmt_info : NULL);
3440 if (!res)
3441 roots.release ();
3443 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3444 where we should do store group splitting. */
3446 return res;
3449 /* Check if there are stmts in the loop that can be vectorized using SLP.
3450 Build SLP trees of packed scalar stmts if SLP is possible. */
3452 opt_result
3453 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3455 unsigned int i;
3456 stmt_vec_info first_element;
3457 slp_instance instance;
3459 DUMP_VECT_SCOPE ("vect_analyze_slp");
3461 unsigned limit = max_tree_size;
3463 scalar_stmts_to_slp_tree_map_t *bst_map
3464 = new scalar_stmts_to_slp_tree_map_t ();
3466 /* Find SLP sequences starting from groups of grouped stores. */
3467 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3468 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3469 STMT_VINFO_GROUPED_ACCESS (first_element)
3470 ? slp_inst_kind_store : slp_inst_kind_ctor,
3471 max_tree_size, &limit);
3473 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3475 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3477 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3478 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3479 bb_vinfo->roots[i].stmts,
3480 bb_vinfo->roots[i].roots,
3481 max_tree_size, &limit, bst_map, NULL))
3483 bb_vinfo->roots[i].stmts = vNULL;
3484 bb_vinfo->roots[i].roots = vNULL;
3489 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3491 /* Find SLP sequences starting from reduction chains. */
3492 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3493 if (! STMT_VINFO_RELEVANT_P (first_element)
3494 && ! STMT_VINFO_LIVE_P (first_element))
3496 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3497 slp_inst_kind_reduc_chain,
3498 max_tree_size, &limit))
3500 /* Dissolve reduction chain group. */
3501 stmt_vec_info vinfo = first_element;
3502 stmt_vec_info last = NULL;
3503 while (vinfo)
3505 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3506 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3507 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3508 last = vinfo;
3509 vinfo = next;
3511 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3512 /* It can be still vectorized as part of an SLP reduction. */
3513 loop_vinfo->reductions.safe_push (last);
3516 /* Find SLP sequences starting from groups of reductions. */
3517 if (loop_vinfo->reductions.length () > 1)
3518 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3519 slp_inst_kind_reduc_group, max_tree_size,
3520 &limit);
3523 hash_set<slp_tree> visited_patterns;
3524 slp_tree_to_load_perm_map_t perm_cache;
3525 slp_compat_nodes_map_t compat_cache;
3527 /* See if any patterns can be found in the SLP tree. */
3528 bool pattern_found = false;
3529 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3530 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3531 &visited_patterns, &perm_cache,
3532 &compat_cache);
3534 /* If any were found optimize permutations of loads. */
3535 if (pattern_found)
3537 hash_map<slp_tree, slp_tree> load_map;
3538 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3540 slp_tree root = SLP_INSTANCE_TREE (instance);
3541 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3542 &load_map, root);
3548 /* The map keeps a reference on SLP nodes built, release that. */
3549 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3550 it != bst_map->end (); ++it)
3551 if ((*it).second)
3552 vect_free_slp_tree ((*it).second);
3553 delete bst_map;
3555 if (pattern_found && dump_enabled_p ())
3557 dump_printf_loc (MSG_NOTE, vect_location,
3558 "Pattern matched SLP tree\n");
3559 hash_set<slp_tree> visited;
3560 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3561 vect_print_slp_graph (MSG_NOTE, vect_location,
3562 SLP_INSTANCE_TREE (instance), visited);
3565 return opt_result::success ();
3568 /* Estimates the cost of inserting layout changes into the SLP graph.
3569 It can also say that the insertion is impossible. */
3571 struct slpg_layout_cost
3573 slpg_layout_cost () = default;
3574 slpg_layout_cost (sreal, bool);
3576 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3577 bool is_possible () const { return depth != sreal::max (); }
3579 bool operator== (const slpg_layout_cost &) const;
3580 bool operator!= (const slpg_layout_cost &) const;
3582 bool is_better_than (const slpg_layout_cost &, bool) const;
3584 void add_parallel_cost (const slpg_layout_cost &);
3585 void add_serial_cost (const slpg_layout_cost &);
3586 void split (unsigned int);
3588 /* The longest sequence of layout changes needed during any traversal
3589 of the partition dag, weighted by execution frequency.
3591 This is the most important metric when optimizing for speed, since
3592 it helps to ensure that we keep the number of operations on
3593 critical paths to a minimum. */
3594 sreal depth = 0;
3596 /* An estimate of the total number of operations needed. It is weighted by
3597 execution frequency when optimizing for speed but not when optimizing for
3598 size. In order to avoid double-counting, a node with a fanout of N will
3599 distribute 1/N of its total cost to each successor.
3601 This is the most important metric when optimizing for size, since
3602 it helps to keep the total number of operations to a minimum. */
3603 sreal total = 0;
3606 /* Construct costs for a node with weight WEIGHT. A higher weight
3607 indicates more frequent execution. IS_FOR_SIZE is true if we are
3608 optimizing for size rather than speed. */
3610 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3611 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3615 bool
3616 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3618 return depth == other.depth && total == other.total;
3621 bool
3622 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3624 return !operator== (other);
3627 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3628 true if we are optimizing for size rather than speed. */
3630 bool
3631 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3632 bool is_for_size) const
3634 if (is_for_size)
3636 if (total != other.total)
3637 return total < other.total;
3638 return depth < other.depth;
3640 else
3642 if (depth != other.depth)
3643 return depth < other.depth;
3644 return total < other.total;
3648 /* Increase the costs to account for something with cost INPUT_COST
3649 happening in parallel with the current costs. */
3651 void
3652 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3654 depth = std::max (depth, input_cost.depth);
3655 total += input_cost.total;
3658 /* Increase the costs to account for something with cost INPUT_COST
3659 happening in series with the current costs. */
3661 void
3662 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3664 depth += other.depth;
3665 total += other.total;
3668 /* Split the total cost among TIMES successors or predecessors. */
3670 void
3671 slpg_layout_cost::split (unsigned int times)
3673 if (times > 1)
3674 total /= times;
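/* For illustration, a hypothetical combination of two costs using the
   operations above (the weights are made up):

     slpg_layout_cost a (sreal (2), false);   // depth 2, total 2
     slpg_layout_cost b (sreal (3), false);   // depth 3, total 3
     a.add_parallel_cost (b);                 // now depth 3, total 5
     a.add_serial_cost (b);                   // now depth 6, total 8
     a.split (2);                             // now depth 6, total 4

   Parallel combination takes the maximum depth but sums the totals,
   serial combination sums both, and split only divides the total.  */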
3677 /* Information about one node in the SLP graph, for use during
3678 vect_optimize_slp_pass. */
3680 struct slpg_vertex
3682 slpg_vertex (slp_tree node_) : node (node_) {}
3684 /* The node itself. */
3685 slp_tree node;
3687 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3688 partitions are flexible; they can have whichever layout consumers
3689 want them to have. */
3690 int partition = -1;
3692 /* The number of nodes that directly use the result of this one
3693 (i.e. the number of nodes that count this one as a child). */
3694 unsigned int out_degree = 0;
3696 /* The execution frequency of the node. */
3697 sreal weight = 0;
3699 /* The total execution frequency of all nodes that directly use the
3700 result of this one. */
3701 sreal out_weight = 0;
3704 /* Information about one partition of the SLP graph, for use during
3705 vect_optimize_slp_pass. */
3707 struct slpg_partition_info
3709 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3710 of m_partitioned_nodes. */
3711 unsigned int node_begin = 0;
3712 unsigned int node_end = 0;
3714 /* Which layout we've chosen to use for this partition, or -1 if
3715 we haven't picked one yet. */
3716 int layout = -1;
3718 /* The number of predecessors and successors in the partition dag.
3719 The predecessors always have lower partition numbers and the
3720 successors always have higher partition numbers.
3722 Note that the directions of these edges are not necessarily the
3723 same as in the data flow graph. For example, if an SCC has separate
3724 partitions for an inner loop and an outer loop, the inner loop's
3725 partition will have at least two incoming edges from the outer loop's
3726 partition: one for a live-in value and one for a live-out value.
3727 In data flow terms, one of these edges would also be from the outer loop
3728 to the inner loop, but the other would be in the opposite direction. */
3729 unsigned int in_degree = 0;
3730 unsigned int out_degree = 0;
3733 /* Information about the costs of using a particular layout for a
3734 particular partition. It can also say that the combination is
3735 impossible. */
3737 struct slpg_partition_layout_costs
3739 bool is_possible () const { return internal_cost.is_possible (); }
3740 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3742 /* The costs inherited from predecessor partitions. */
3743 slpg_layout_cost in_cost;
3745 /* The inherent cost of the layout within the node itself. For example,
3746 this is nonzero for a load if choosing a particular layout would require
3747 the load to permute the loaded elements. It is nonzero for a
3748 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3749 to full-vector moves. */
3750 slpg_layout_cost internal_cost;
3752 /* The costs inherited from successor partitions. */
3753 slpg_layout_cost out_cost;
3756 /* This class tries to optimize the layout of vectors in order to avoid
3757 unnecessary shuffling. At the moment, the set of possible layouts is
3758 restricted to bijective permutations.
3760 The goal of the pass depends on whether we're optimizing for size or
3761 for speed. When optimizing for size, the goal is to reduce the overall
3762 number of layout changes (including layout changes implied by things
3763 like load permutations). When optimizing for speed, the goal is to
3764 reduce the maximum latency attributable to layout changes on any
3765 non-cyclical path through the data flow graph.
3767 For example, when optimizing a loop nest for speed, we will prefer
3768 to make layout changes outside of a loop rather than inside of a loop,
3769 and will prefer to make layout changes in parallel rather than serially,
3770 even if that increases the overall number of layout changes.
3772 The high-level procedure is:
3774 (1) Build a graph in which edges go from uses (parents) to definitions
3775 (children).
3777 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3779 (3) When optimizing for speed, partition the nodes in each SCC based
3780 on their containing cfg loop. When optimizing for size, treat
3781 each SCC as a single partition.
3783 This gives us a dag of partitions. The goal is now to assign a
3784 layout to each partition.
3786 (4) Construct a set of vector layouts that are worth considering.
3787 Record which nodes must keep their current layout.
3789 (5) Perform a forward walk over the partition dag (from loads to stores)
3790 accumulating the "forward" cost of using each layout. When visiting
3791 each partition, assign a tentative choice of layout to the partition
3792 and use that choice when calculating the cost of using a different
3793 layout in successor partitions.
3795 (6) Perform a backward walk over the partition dag (from stores to loads),
3796 accumulating the "backward" cost of using each layout. When visiting
3797 each partition, make a final choice of layout for that partition based
3798 on the accumulated forward costs (from (5)) and backward costs
3799 (from (6)).
3801 (7) Apply the chosen layouts to the SLP graph.
3803 For example, consider the SLP statements:
3805 S1: a_1 = load
3806 loop:
3807 S2: a_2 = PHI<a_1, a_3>
3808 S3: b_1 = load
3809 S4: a_3 = a_2 + b_1
3810 exit:
3811 S5: a_4 = PHI<a_3>
3812 S6: store a_4
3814 S2 and S4 form an SCC and are part of the same loop. Every other
3815 statement is in a singleton SCC. In this example there is a one-to-one
3816 mapping between SCCs and partitions and the partition dag looks like this:
3818 S1 S3
3820 S2+S4
3826 S2, S3 and S4 will have a higher execution frequency than the other
3827 statements, so when optimizing for speed, the goal is to avoid any
3828 layout changes:
3830 - within S3
3831 - within S2+S4
3832 - on the S3->S2+S4 edge
3834 For example, if S3 was originally a reversing load, the goal of the
3835 pass is to make it an unreversed load and change the layout on the
3836 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
3837 on S1->S2+S4 and S5->S6 would also be acceptable.)
3839 The difference between SCCs and partitions becomes important if we
3840 add an outer loop:
3842 S1: a_1 = ...
3843 loop1:
3844 S2: a_2 = PHI<a_1, a_6>
3845 S3: b_1 = load
3846 S4: a_3 = a_2 + b_1
3847 loop2:
3848 S5: a_4 = PHI<a_3, a_5>
3849 S6: c_1 = load
3850 S7: a_5 = a_4 + c_1
3851 exit2:
3852 S8: a_6 = PHI<a_5>
3853 S9: store a_6
3854 exit1:
3856 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
3857 for speed, we usually do not want restrictions in the outer loop to "infect"
3858 the decision for the inner loop. For example, if an outer-loop node
3859 in the SCC contains a statement with a fixed layout, that should not
3860 prevent the inner loop from using a different layout. Conversely,
3861 the inner loop should not dictate a layout to the outer loop: if the
3862 outer loop does a lot of computation, then it may not be efficient to
3863 do all of that computation in the inner loop's preferred layout.
3865 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
3866 and S5+S7 (inner). We also try to arrange partitions so that:
3868 - the partition for an outer loop comes before the partition for
3869 an inner loop
3871 - if a sibling loop A dominates a sibling loop B, A's partition
3872 comes before B's
3874 This gives the following partition dag for the example above:
3876 S1 S3
3878 S2+S4+S8 S6
3879 | \\ /
3880 | S5+S7
3884 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
3885 one for a reversal of the edge S7->S8.
3887 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
3888 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
3889 preferred layout against the cost of changing the layout on entry to the
3890 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
3892 Although this works well when optimizing for speed, it has the downside
3893 when optimizing for size that the choice of layout for S5+S7 is completely
3894 independent of S9, which lessens the chance of reducing the overall number
3895 of permutations. We therefore do not partition SCCs when optimizing
3896 for size.
3898 To give a concrete example of the difference between optimizing
3899 for size and speed, consider:
3901 a[0] = (b[1] << c[3]) - d[1];
3902 a[1] = (b[0] << c[2]) - d[0];
3903 a[2] = (b[3] << c[1]) - d[3];
3904 a[3] = (b[2] << c[0]) - d[2];
3906 There are three different layouts here: one for a, one for b and d,
3907 and one for c. When optimizing for speed it is better to permute each
3908 of b, c and d into the order required by a, since those permutations
3909 happen in parallel. But when optimizing for size, it is better to:
3911 - permute c into the same order as b
3912 - do the arithmetic
3913 - permute the result into the order required by a
3915 This gives 2 permutations rather than 3. */
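/* Sketched in hypothetical pseudo-code ("permute" below is just shorthand
   for a lane permutation, not a real API), the two strategies for the
   example above are:

     // Optimizing for speed: three permutations, done in parallel.
     bp = permute (b, { 1, 0, 3, 2 });
     cp = permute (c, { 3, 2, 1, 0 });
     dp = permute (d, { 1, 0, 3, 2 });
     a = (bp << cp) - dp;

     // Optimizing for size: permute c to match b's order, do the
     // arithmetic, then permute the result into the order required by a.
     cp = permute (c, { 2, 3, 0, 1 });
     t = (b << cp) - d;
     a = permute (t, { 1, 0, 3, 2 });  */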
3917 class vect_optimize_slp_pass
3919 public:
3920 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
3921 void run ();
3923 private:
3924 /* Graph building. */
3925 struct loop *containing_loop (slp_tree);
3926 bool is_cfg_latch_edge (graph_edge *);
3927 void build_vertices (hash_set<slp_tree> &, slp_tree);
3928 void build_vertices ();
3929 void build_graph ();
3931 /* Partitioning. */
3932 void create_partitions ();
3933 template<typename T> void for_each_partition_edge (unsigned int, T);
3935 /* Layout selection. */
3936 bool is_compatible_layout (slp_tree, unsigned int);
3937 int change_layout_cost (slp_tree, unsigned int, unsigned int);
3938 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
3939 unsigned int);
3940 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
3941 int, unsigned int);
3942 int internal_node_cost (slp_tree, int, unsigned int);
3943 void start_choosing_layouts ();
3945 /* Cost propagation. */
3946 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
3947 unsigned int, unsigned int);
3948 slpg_layout_cost total_in_cost (unsigned int);
3949 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
3950 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
3951 void forward_pass ();
3952 void backward_pass ();
3954 /* Rematerialization. */
3955 slp_tree get_result_with_layout (slp_tree, unsigned int);
3956 void materialize ();
3958 /* Clean-up. */
3959 void remove_redundant_permutations ();
3961 void dump ();
3963 vec_info *m_vinfo;
3965 /* True if we should optimize the graph for size, false if we should
3966 optimize it for speed. (It wouldn't be easy to make this decision
3967 more locally.) */
3968 bool m_optimize_size;
3970 /* A graph of all SLP nodes, with edges leading from uses to definitions.
3971 In other words, a node's predecessors are its slp_tree parents and
3972 a node's successors are its slp_tree children. */
3973 graph *m_slpg = nullptr;
3975 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
3976 auto_vec<slpg_vertex> m_vertices;
3978 /* The list of all leaves of M_SLPG, such as external definitions, constants,
3979 and loads. */
3980 auto_vec<int> m_leafs;
3982 /* This array has one entry for every vector layout that we're considering.
3983 Element 0 is null and indicates "no change". Other entries describe
3984 permutations that are inherent in the current graph and that we would
3985 like to reverse if possible.
3987 For example, a permutation { 1, 2, 3, 0 } means that something has
3988 effectively been permuted in that way, such as a load group
3989 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
3990 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
3991 in order to put things "back" in order. */
3992 auto_vec<vec<unsigned> > m_perms;
3994 /* A partitioning of the nodes for which a layout must be chosen.
3995 Each partition represents an <SCC, cfg loop> pair; that is,
3996 nodes in different SCCs belong to different partitions, and nodes
3997 within an SCC can be further partitioned according to a containing
3998 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4000 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4001 from leaves (such as loads) to roots (such as stores).
4003 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4004 auto_vec<slpg_partition_info> m_partitions;
4006 /* The list of all nodes for which a layout must be chosen. Nodes for
4007 partition P come before the nodes for partition P+1. Nodes within a
4008 partition are in reverse postorder. */
4009 auto_vec<unsigned int> m_partitioned_nodes;
4011 /* Index P * num-layouts + L contains the cost of using layout L
4012 for partition P. */
4013 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4015 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4016 original output of node N adjusted to have layout L. */
4017 auto_vec<slp_tree> m_node_layouts;
4020 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4021 Also record whether we should optimize anything for speed rather
4022 than size. */
4024 void
4025 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4026 slp_tree node)
4028 unsigned i;
4029 slp_tree child;
4031 if (visited.add (node))
4032 return;
4034 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4036 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4037 if (optimize_bb_for_speed_p (bb))
4038 m_optimize_size = false;
4041 node->vertex = m_vertices.length ();
4042 m_vertices.safe_push (slpg_vertex (node));
4044 bool leaf = true;
4045 bool force_leaf = false;
4046 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4047 if (child)
4049 leaf = false;
4050 build_vertices (visited, child);
4052 else
4053 force_leaf = true;
4054 /* Since SLP discovery works along use-def edges, all cycles have an
4055 entry - but there's the exception of cycles where we do not handle
4056 the entry explicitly (but with a NULL SLP node), like some reductions
4057 and inductions. Force those SLP PHIs to act as leafs to make them
4058 backwards reachable. */
4059 if (leaf || force_leaf)
4060 m_leafs.safe_push (node->vertex);
4063 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4065 void
4066 vect_optimize_slp_pass::build_vertices ()
4068 hash_set<slp_tree> visited;
4069 unsigned i;
4070 slp_instance instance;
4071 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4072 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4075 /* Apply (reverse) bijective PERM to VEC. */
4077 template <class T>
4078 static void
4079 vect_slp_permute (vec<unsigned> perm,
4080 vec<T> &vec, bool reverse)
4082 auto_vec<T, 64> saved;
4083 saved.create (vec.length ());
4084 for (unsigned i = 0; i < vec.length (); ++i)
4085 saved.quick_push (vec[i]);
4087 if (reverse)
4089 for (unsigned i = 0; i < vec.length (); ++i)
4090 vec[perm[i]] = saved[i];
4091 for (unsigned i = 0; i < vec.length (); ++i)
4092 gcc_assert (vec[perm[i]] == saved[i]);
4094 else
4096 for (unsigned i = 0; i < vec.length (); ++i)
4097 vec[i] = saved[perm[i]];
4098 for (unsigned i = 0; i < vec.length (); ++i)
4099 gcc_assert (vec[i] == saved[perm[i]]);
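/* A small worked example of the helper above (values are made up):
   with perm = { 1, 2, 3, 0 } and vec = { a, b, c, d },

     vect_slp_permute (perm, vec, false);   // vec becomes { b, c, d, a }
     vect_slp_permute (perm, vec, true);    // vec is back to { a, b, c, d }

   so applying the permutation forwards and then in reverse restores the
   original order, as expected for a bijective permutation.  */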
4103 /* Return the cfg loop that contains NODE. */
4105 struct loop *
4106 vect_optimize_slp_pass::containing_loop (slp_tree node)
4108 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4109 if (!rep)
4110 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4111 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4114 /* Return true if UD (an edge from a use to a definition) is associated
4115 with a loop latch edge in the cfg. */
4117 bool
4118 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4120 slp_tree use = m_vertices[ud->src].node;
4121 slp_tree def = m_vertices[ud->dest].node;
4122 if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4123 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4124 return false;
4126 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4127 return (is_a<gphi *> (use_rep->stmt)
4128 && bb_loop_header_p (gimple_bb (use_rep->stmt))
4129 && containing_loop (def) == containing_loop (use));
4132 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4133 a nonnull data field. */
4135 void
4136 vect_optimize_slp_pass::build_graph ()
4138 m_optimize_size = true;
4139 build_vertices ();
4141 m_slpg = new_graph (m_vertices.length ());
4142 for (slpg_vertex &v : m_vertices)
4143 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4144 if (child)
4146 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4147 if (is_cfg_latch_edge (ud))
4148 ud->data = this;
4152 /* Return true if E corresponds to a loop latch edge in the cfg. */
4154 static bool
4155 skip_cfg_latch_edges (graph_edge *e)
4157 return e->data;
4160 /* Create the node partitions. */
4162 void
4163 vect_optimize_slp_pass::create_partitions ()
4165 /* Calculate a postorder of the graph, ignoring edges that correspond
4166 to natural latch edges in the cfg. Reading the vector from the end
4167 to the beginning gives the reverse postorder. */
4168 auto_vec<int> initial_rpo;
4169 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4170 false, NULL, skip_cfg_latch_edges);
4171 gcc_assert (initial_rpo.length () == m_vertices.length ());
4173 /* Calculate the strongly connected components of the graph. */
4174 auto_vec<int> scc_grouping;
4175 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4177 /* Create a new index order in which all nodes from the same SCC are
4178 consecutive. Use scc_pos to record the index of the first node in
4179 each SCC. */
4180 auto_vec<unsigned int> scc_pos (num_sccs);
4181 int last_component = -1;
4182 unsigned int node_count = 0;
4183 for (unsigned int node_i : scc_grouping)
4185 if (last_component != m_slpg->vertices[node_i].component)
4187 last_component = m_slpg->vertices[node_i].component;
4188 gcc_assert (last_component == int (scc_pos.length ()));
4189 scc_pos.quick_push (node_count);
4191 node_count += 1;
4193 gcc_assert (node_count == initial_rpo.length ()
4194 && last_component + 1 == int (num_sccs));
4196 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4197 inside each SCC following the RPO we calculated above. The fact that
4198 we ignored natural latch edges when calculating the RPO should ensure
4199 that, for natural loop nests:
4201 - the first node that we encounter in a cfg loop is the loop header phi
4202 - the loop header phis are in dominance order
4204 Arranging for this is an optimization (see below) rather than a
4205 correctness issue. Unnatural loops with a tangled mess of backedges
4206 will still work correctly, but might give poorer results.
4208 Also update scc_pos so that it gives 1 + the index of the last node
4209 in the SCC. */
4210 m_partitioned_nodes.safe_grow (node_count);
4211 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4213 unsigned int node_i = initial_rpo[old_i];
4214 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4215 m_partitioned_nodes[new_i] = node_i;
4218 /* When optimizing for speed, partition each SCC based on the containing
4219 cfg loop. The order we constructed above should ensure that, for natural
4220 cfg loops, we'll create sub-SCC partitions for outer loops before
4221 the corresponding sub-SCC partitions for inner loops. Similarly,
4222 when one sibling loop A dominates another sibling loop B, we should
4223 create a sub-SCC partition for A before a sub-SCC partition for B.
4225 As above, nothing depends for correctness on whether this achieves
4226 a natural nesting, but we should get better results when it does. */
4227 m_partitions.reserve (m_vertices.length ());
4228 unsigned int next_partition_i = 0;
4229 hash_map<struct loop *, int> loop_partitions;
4230 unsigned int rpo_begin = 0;
4231 unsigned int num_partitioned_nodes = 0;
4232 for (unsigned int rpo_end : scc_pos)
4234 loop_partitions.empty ();
4235 unsigned int partition_i = next_partition_i;
4236 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4238 /* Handle externals and constants optimistically throughout.
4239 But treat existing vectors as fixed since we do not handle
4240 permuting them. */
4241 unsigned int node_i = m_partitioned_nodes[rpo_i];
4242 auto &vertex = m_vertices[node_i];
4243 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4244 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4245 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4246 vertex.partition = -1;
4247 else
4249 bool existed;
4250 if (m_optimize_size)
4251 existed = next_partition_i > partition_i;
4252 else
4254 struct loop *loop = containing_loop (vertex.node);
4255 auto &entry = loop_partitions.get_or_insert (loop, &existed);
4256 if (!existed)
4257 entry = next_partition_i;
4258 partition_i = entry;
4260 if (!existed)
4262 m_partitions.quick_push (slpg_partition_info ());
4263 next_partition_i += 1;
4265 vertex.partition = partition_i;
4266 num_partitioned_nodes += 1;
4267 m_partitions[partition_i].node_end += 1;
4270 rpo_begin = rpo_end;
4273 /* Assign ranges of consecutive node indices to each partition,
4274 in partition order. Start with node_end being the same as
4275 node_begin so that the next loop can use it as a counter. */
4276 unsigned int node_begin = 0;
4277 for (auto &partition : m_partitions)
4279 partition.node_begin = node_begin;
4280 node_begin += partition.node_end;
4281 partition.node_end = partition.node_begin;
4283 gcc_assert (node_begin == num_partitioned_nodes);
4285 /* Finally build the list of nodes in partition order. */
4286 m_partitioned_nodes.truncate (num_partitioned_nodes);
4287 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4289 int partition_i = m_vertices[node_i].partition;
4290 if (partition_i >= 0)
4292 unsigned int order_i = m_partitions[partition_i].node_end++;
4293 m_partitioned_nodes[order_i] = node_i;
4298 /* Look for edges from earlier partitions into node NODE_I and edges from
4299 node NODE_I into later partitions. Call:
4301 FN (ud, other_node_i)
4303 for each such use-to-def edge ud, where other_node_i is the node at the
4304 other end of the edge. */
4306 template<typename T>
4307 void
4308 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4310 int partition_i = m_vertices[node_i].partition;
4311 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4312 pred; pred = pred->pred_next)
4314 int src_partition_i = m_vertices[pred->src].partition;
4315 if (src_partition_i >= 0 && src_partition_i != partition_i)
4316 fn (pred, pred->src);
4318 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4319 succ; succ = succ->succ_next)
4321 int dest_partition_i = m_vertices[succ->dest].partition;
4322 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4323 fn (succ, succ->dest);
4327 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4328 that NODE would operate on. This test is independent of NODE's actual
4329 operation. */
4331 bool
4332 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4333 unsigned int layout_i)
4335 if (layout_i == 0)
4336 return true;
4338 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4339 return false;
4341 return true;
4344 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4345 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4346 layouts is incompatible with NODE or if the change is not possible for
4347 some other reason.
4349 The properties taken from NODE include the number of lanes and the
4350 vector type. The actual operation doesn't matter. */
4353 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4354 unsigned int from_layout_i,
4355 unsigned int to_layout_i)
4357 if (!is_compatible_layout (node, from_layout_i)
4358 || !is_compatible_layout (node, to_layout_i))
4359 return -1;
4361 if (from_layout_i == to_layout_i)
4362 return 0;
4364 auto_vec<slp_tree, 1> children (1);
4365 children.quick_push (node);
4366 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4367 if (from_layout_i > 0)
4368 for (unsigned int i : m_perms[from_layout_i])
4369 perm.quick_push ({ 0, i });
4370 else
4371 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4372 perm.quick_push ({ 0, i });
4373 if (to_layout_i > 0)
4374 vect_slp_permute (m_perms[to_layout_i], perm, true);
4375 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4376 children, false);
4377 if (count >= 0)
4378 return MAX (count, 1);
4380 /* ??? In principle we could try changing via layout 0, giving two
4381 layout changes rather than 1. Doing that would require
4382 corresponding support in get_result_with_layout. */
4383 return -1;
4386 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4388 inline slpg_partition_layout_costs &
4389 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4390 unsigned int layout_i)
4392 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4395 /* Change PERM in one of two ways:
4397 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4398 chosen for child I of NODE.
4400 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4402 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
4404 void
4405 vect_optimize_slp_pass::
4406 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4407 int in_layout_i, unsigned int out_layout_i)
4409 for (auto &entry : perm)
4411 int this_in_layout_i = in_layout_i;
4412 if (this_in_layout_i < 0)
4414 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4415 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4416 this_in_layout_i = m_partitions[in_partition_i].layout;
4418 if (this_in_layout_i > 0)
4419 entry.second = m_perms[this_in_layout_i][entry.second];
4421 if (out_layout_i > 0)
4422 vect_slp_permute (m_perms[out_layout_i], perm, true);
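/* For example (hypothetical layout): if the child feeding an entry has been
   given layout { 1, 2, 3, 0 }, its original lane 0 now lives in lane 1, so
   an entry that previously selected lane 0 is rewritten to select lane 1
   (and similarly for the other lanes).  */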
4425 /* Check whether the target allows NODE to be rearranged so that the node's
4426 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4427 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4429 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4430 NODE can adapt to the layout changes that have (perhaps provisionally)
4431 been chosen for NODE's children, so that no extra permutations are
4432 needed on either the input or the output of NODE.
4434 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4435 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4437 IN_LAYOUT_I has no meaning for other types of node.
4439 Keeping the node as-is is always valid. If the target doesn't appear
4440 to support the node as-is, but might realistically support other layouts,
4441 then layout 0 instead has the cost of a worst-case permutation. On the
4442 one hand, this ensures that every node has at least one valid layout,
4443 avoiding what would otherwise be an awkward special case. On the other,
4444 it still encourages the pass to change an invalid pre-existing layout
4445 choice into a valid one. */
4448 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4449 unsigned int out_layout_i)
4451 const int fallback_cost = 1;
4453 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4455 auto_lane_permutation_t tmp_perm;
4456 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4458 /* Check that the child nodes support the chosen layout. Checking
4459 the first child is enough, since any second child would have the
4460 same shape. */
4461 auto first_child = SLP_TREE_CHILDREN (node)[0];
4462 if (in_layout_i > 0
4463 && !is_compatible_layout (first_child, in_layout_i))
4464 return -1;
4466 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4467 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4468 node, tmp_perm,
4469 SLP_TREE_CHILDREN (node),
4470 false);
4471 if (count < 0)
4473 if (in_layout_i == 0 && out_layout_i == 0)
4475 /* Use the fallback cost if the node could in principle support
4476 some nonzero layout for both the inputs and the outputs.
4477 Otherwise assume that the node will be rejected later
4478 and rebuilt from scalars. */
4479 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4480 return fallback_cost;
4481 return 0;
4483 return -1;
4486 /* We currently have no way of telling whether the new layout is cheaper
4487 or more expensive than the old one. But at least in principle,
4488 it should be worth making zero permutations (whole-vector shuffles)
4489 cheaper than real permutations, in case the pass is able to remove
4490 the latter. */
4491 return count == 0 ? 0 : 1;
4494 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4495 if (rep
4496 && STMT_VINFO_DATA_REF (rep)
4497 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4498 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4500 auto_load_permutation_t tmp_perm;
4501 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4502 if (out_layout_i > 0)
4503 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4505 poly_uint64 vf = 1;
4506 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4507 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4508 unsigned int n_perms;
4509 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4510 nullptr, vf, true, false, &n_perms))
4512 auto rep = SLP_TREE_REPRESENTATIVE (node);
4513 if (out_layout_i == 0)
4515 /* Use the fallback cost if the load is an N-to-N permutation.
4516 Otherwise assume that the node will be rejected later
4517 and rebuilt from scalars. */
4518 if (STMT_VINFO_GROUPED_ACCESS (rep)
4519 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4520 == SLP_TREE_LANES (node)))
4521 return fallback_cost;
4522 return 0;
4524 return -1;
4527 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4528 return n_perms == 0 ? 0 : 1;
4531 return 0;
4534 /* Decide which element layouts we should consider using. Calculate the
4535 weights associated with inserting layout changes on partition edges.
4536 Also mark partitions that cannot change layout, by setting their
4537 layout to zero. */
4539 void
4540 vect_optimize_slp_pass::start_choosing_layouts ()
4542 /* Used to assign unique permutation indices. */
4543 using perm_hash = unbounded_hashmap_traits<
4544 vec_free_hash_base<int_hash_base<unsigned>>,
4545 int_hash<int, -1, -2>
4547 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4549 /* Layout 0 is "no change". */
4550 m_perms.safe_push (vNULL);
4552 /* Create layouts from existing permutations. */
4553 auto_load_permutation_t tmp_perm;
4554 for (unsigned int node_i : m_partitioned_nodes)
4556 /* Leafs also double as entries to the reverse graph. Allow the
4557 layout of those to be changed. */
4558 auto &vertex = m_vertices[node_i];
4559 auto &partition = m_partitions[vertex.partition];
4560 if (!m_slpg->vertices[node_i].succ)
4561 partition.layout = 0;
4563 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4564 slp_tree node = vertex.node;
4565 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4566 slp_tree child;
4567 unsigned HOST_WIDE_INT imin, imax = 0;
4568 bool any_permute = false;
4569 tmp_perm.truncate (0);
4570 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4572 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4573 unpermuted, record a layout that reverses this permutation.
4575 We would need more work to cope with loads that are internally
4576 permuted and also have inputs (such as masks for
4577 IFN_MASK_LOADs). */
4578 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4579 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4580 continue;
4581 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4582 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4583 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4585 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4586 && SLP_TREE_CHILDREN (node).length () == 1
4587 && (child = SLP_TREE_CHILDREN (node)[0])
4588 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4589 .is_constant (&imin)))
4591 /* If the child has the same vector size as this node,
4592 reversing the permutation can make the permutation a no-op.
4593 In other cases it can change a true permutation into a
4594 full-vector extract. */
4595 tmp_perm.reserve (SLP_TREE_LANES (node));
4596 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4597 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4599 else
4600 continue;
4602 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4604 unsigned idx = tmp_perm[j];
4605 imin = MIN (imin, idx);
4606 imax = MAX (imax, idx);
4607 if (idx - tmp_perm[0] != j)
4608 any_permute = true;
4610 /* If the span doesn't match we'd disrupt VF computation, so avoid
4611 that for now. */
4612 if (imax - imin + 1 != SLP_TREE_LANES (node))
4613 continue;
4614 /* If there's no permute, there is no need to split one out. In this case
4615 we can consider turning a load into a permuted load, if that
4616 turns out to be cheaper than alternatives. */
4617 if (!any_permute)
4619 partition.layout = -1;
4620 continue;
4623 /* For now only handle true permutes, like
4624 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4625 when permuting constants and invariants, keeping the permute
4626 bijective. */
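/* For instance, a load permutation like { 1, 1, 4, 3 } (a hypothetical
   example) spans a contiguous range of four group elements but is not
   bijective, since element 2 is never loaded, so no layout is created
   for it. */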
4627 auto_sbitmap load_index (SLP_TREE_LANES (node));
4628 bitmap_clear (load_index);
4629 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4630 bitmap_set_bit (load_index, tmp_perm[j] - imin);
4631 unsigned j;
4632 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4633 if (!bitmap_bit_p (load_index, j))
4634 break;
4635 if (j != SLP_TREE_LANES (node))
4636 continue;
4638 vec<unsigned> perm = vNULL;
4639 perm.safe_grow (SLP_TREE_LANES (node), true);
4640 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4641 perm[j] = tmp_perm[j] - imin;
4643 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4645 /* Continue to use existing layouts, but don't add any more. */
4646 int *entry = layout_ids.get (perm);
4647 partition.layout = entry ? *entry : 0;
4648 perm.release ();
4650 else
4652 bool existed;
4653 int &layout_i = layout_ids.get_or_insert (perm, &existed);
4654 if (existed)
4655 perm.release ();
4656 else
4658 layout_i = m_perms.length ();
4659 m_perms.safe_push (perm);
4661 partition.layout = layout_i;
4665 /* Initially assume that every layout is possible and has zero cost
4666 in every partition. */
4667 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4668 * m_perms.length ());
4670 /* Outgoing permutations facing non-reduction graph entries that are
4671 not represented in the graph have to be marked for materialization. */
4672 for (slp_instance instance : m_vinfo->slp_instances)
4673 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4675 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4676 m_partitions[m_vertices[node_i].partition].layout = 0;
4679 /* Check which layouts each node and partition can handle. Calculate the
4680 weights associated with inserting layout changes on edges. */
4681 for (unsigned int node_i : m_partitioned_nodes)
4683 auto &vertex = m_vertices[node_i];
4684 auto &partition = m_partitions[vertex.partition];
4685 slp_tree node = vertex.node;
4687 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4689 vertex.weight = vect_slp_node_weight (node);
4691 /* We do not handle stores with a permutation, so all
4692 incoming permutations must have been materialized.
4694 We also don't handle masked grouped loads, which lack a
4695 permutation vector. In this case the memory locations
4696 form an implicit second input to the loads, on top of the
4697 explicit mask input, and the memory input's layout cannot
4698 be changed.
4700 On the other hand, we do support permuting gather loads and
4701 masked gather loads, where each scalar load is independent
4702 of the others. This can be useful if the address/index input
4703 benefits from permutation. */
4704 if (STMT_VINFO_DATA_REF (rep)
4705 && STMT_VINFO_GROUPED_ACCESS (rep)
4706 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4707 partition.layout = 0;
4709 /* We cannot change the layout of an operation that is
4710 not independent of lanes. Note this is an explicit
4711 negative list since that's much shorter than the respective
4712 positive one, but it's critical to keep maintaining it. */
4713 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4714 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4716 case CFN_COMPLEX_ADD_ROT90:
4717 case CFN_COMPLEX_ADD_ROT270:
4718 case CFN_COMPLEX_MUL:
4719 case CFN_COMPLEX_MUL_CONJ:
4720 case CFN_VEC_ADDSUB:
4721 case CFN_VEC_FMADDSUB:
4722 case CFN_VEC_FMSUBADD:
4723 partition.layout = 0;
4724 default:;
4728 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4730 auto &other_vertex = m_vertices[other_node_i];
4732 /* Count the number of edges from earlier partitions and the number
4733 of edges to later partitions. */
4734 if (other_vertex.partition < vertex.partition)
4735 partition.in_degree += 1;
4736 else
4737 partition.out_degree += 1;
4739 /* If the current node uses the result of OTHER_NODE_I, accumulate
4740 the effects of that. */
4741 if (ud->src == int (node_i))
4743 other_vertex.out_weight += vertex.weight;
4744 other_vertex.out_degree += 1;
4747 for_each_partition_edge (node_i, process_edge);
4751 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4752 its current (provisional) choice of layout. The inputs do not necessarily
4753 have the same layout as each other. */
4755 slpg_layout_cost
4756 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4758 auto &vertex = m_vertices[node_i];
4759 slpg_layout_cost cost;
4760 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4762 auto &other_vertex = m_vertices[other_node_i];
4763 if (other_vertex.partition < vertex.partition)
4765 auto &other_partition = m_partitions[other_vertex.partition];
4766 auto &other_costs = partition_layout_costs (other_vertex.partition,
4767 other_partition.layout);
4768 slpg_layout_cost this_cost = other_costs.in_cost;
4769 this_cost.add_serial_cost (other_costs.internal_cost);
4770 this_cost.split (other_partition.out_degree);
4771 cost.add_parallel_cost (this_cost);
4774 for_each_partition_edge (node_i, add_cost);
4775 return cost;
4778 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4779 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4780 slpg_layout_cost::impossible () if the change isn't possible. */
4782 slpg_layout_cost
4783 vect_optimize_slp_pass::
4784 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4785 unsigned int layout2_i)
4787 auto &def_vertex = m_vertices[ud->dest];
4788 auto &use_vertex = m_vertices[ud->src];
4789 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4790 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4791 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4792 use_layout_i);
4793 if (factor < 0)
4794 return slpg_layout_cost::impossible ();
4796 /* We have a choice of putting the layout change at the site of the
4797 definition or at the site of the use. Prefer the former when
4798 optimizing for size or when the execution frequency of the
4799 definition is no greater than the combined execution frequencies of
4800 the uses. When putting the layout change at the site of the definition,
4801 divvy up the cost among all consumers. */
4802 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4804 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4805 cost.split (def_vertex.out_degree);
4806 return cost;
4808 return { use_vertex.weight * factor, m_optimize_size };
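/* A hypothetical example of the trade-off above: suppose the definition
   has weight 2, three consumers of weight 4 each (so out_weight 12 and
   out_degree 3), and the layout change has factor 1.  When optimizing for
   speed the change is placed at the definition and each edge is charged
   { depth 2, total 2/3 }.  If the definition instead had weight 20, the
   change would be placed at the use and this edge would be charged
   { depth 4, total 4 }.  */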
4811 /* UD represents a use-def link between FROM_NODE_I and a node in a later
4812 partition; FROM_NODE_I could be the definition node or the use node.
4813 The node at the other end of the link wants to use layout TO_LAYOUT_I.
4814 Return the cost of any necessary fix-ups on edge UD, or return
4815 slpg_layout_cost::impossible () if the change isn't possible.
4817 At this point, FROM_NODE_I's partition has chosen the cheapest
4818 layout based on the information available so far, but this choice
4819 is only provisional. */
4821 slpg_layout_cost
4822 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
4823 unsigned int to_layout_i)
4825 auto &from_vertex = m_vertices[from_node_i];
4826 unsigned int from_partition_i = from_vertex.partition;
4827 slpg_partition_info &from_partition = m_partitions[from_partition_i];
4828 gcc_assert (from_partition.layout >= 0);
4830 /* First calculate the cost on the assumption that FROM_PARTITION sticks
4831 with its current layout preference. */
4832 slpg_layout_cost cost = slpg_layout_cost::impossible ();
4833 auto edge_cost = edge_layout_cost (ud, from_node_i,
4834 from_partition.layout, to_layout_i);
4835 if (edge_cost.is_possible ())
4837 auto &from_costs = partition_layout_costs (from_partition_i,
4838 from_partition.layout);
4839 cost = from_costs.in_cost;
4840 cost.add_serial_cost (from_costs.internal_cost);
4841 cost.split (from_partition.out_degree);
4842 cost.add_serial_cost (edge_cost);
4845 /* Take the minimum of that cost and the cost that applies if
4846 FROM_PARTITION instead switches to TO_LAYOUT_I. */
4847 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
4848 to_layout_i);
4849 if (direct_layout_costs.is_possible ())
4851 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
4852 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
4853 direct_cost.split (from_partition.out_degree);
4854 if (!cost.is_possible ()
4855 || direct_cost.is_better_than (cost, m_optimize_size))
4856 cost = direct_cost;
4859 return cost;
4862 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
4863 partition; TO_NODE_I could be the definition node or the use node.
4864 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
4865 return the cost of any necessary fix-ups on edge UD, or
4866 slpg_layout_cost::impossible () if the choice cannot be made.
4868 At this point, TO_NODE_I's partition has a fixed choice of layout. */
4870 slpg_layout_cost
4871 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
4872 unsigned int from_layout_i)
4874 auto &to_vertex = m_vertices[to_node_i];
4875 unsigned int to_partition_i = to_vertex.partition;
4876 slpg_partition_info &to_partition = m_partitions[to_partition_i];
4877 gcc_assert (to_partition.layout >= 0);
4879 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
4880 adjusted for this input having layout FROM_LAYOUT_I. Assume that
4881 any other inputs keep their current choice of layout. */
4882 auto &to_costs = partition_layout_costs (to_partition_i,
4883 to_partition.layout);
4884 if (ud->src == int (to_node_i)
4885 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
4887 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
4888 auto old_layout = from_partition.layout;
4889 from_partition.layout = from_layout_i;
4890 int factor = internal_node_cost (to_vertex.node, -1,
4891 to_partition.layout);
4892 from_partition.layout = old_layout;
4893 if (factor >= 0)
4895 slpg_layout_cost cost = to_costs.out_cost;
4896 cost.add_serial_cost ({ to_vertex.weight * factor,
4897 m_optimize_size });
4898 cost.split (to_partition.in_degree);
4899 return cost;
4903 /* Compute the cost if we insert any necessary layout change on edge UD. */
4904 auto edge_cost = edge_layout_cost (ud, to_node_i,
4905 to_partition.layout, from_layout_i);
4906 if (edge_cost.is_possible ())
4908 slpg_layout_cost cost = to_costs.out_cost;
4909 cost.add_serial_cost (to_costs.internal_cost);
4910 cost.split (to_partition.in_degree);
4911 cost.add_serial_cost (edge_cost);
4912 return cost;
4915 return slpg_layout_cost::impossible ();
4918 /* Make a forward pass through the partitions, accumulating input costs.
4919 Make a tentative (provisional) choice of layout for each partition,
4920 ensuring that this choice still allows later partitions to keep
4921 their original layout. */
4923 void
4924 vect_optimize_slp_pass::forward_pass ()
4926 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
4927 ++partition_i)
4929 auto &partition = m_partitions[partition_i];
4931 /* If the partition consists of a single VEC_PERM_EXPR, precompute
4932 the incoming cost that would apply if every predecessor partition
4933 keeps its current layout. This is used within the loop below. */
4934 slpg_layout_cost in_cost;
4935 slp_tree single_node = nullptr;
4936 if (partition.node_end == partition.node_begin + 1)
4938 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
4939 single_node = m_vertices[node_i].node;
4940 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
4941 in_cost = total_in_cost (node_i);
4944 /* Go through the possible layouts. Decide which ones are valid
4945 for this partition and record which of the valid layouts has
4946 the lowest cost. */
4947 unsigned int min_layout_i = 0;
4948 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
4949 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
4951 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
4952 if (!layout_costs.is_possible ())
4953 continue;
4955 /* If the recorded layout is already 0 then the layout cannot
4956 change. */
4957 if (partition.layout == 0 && layout_i != 0)
4959 layout_costs.mark_impossible ();
4960 continue;
4963 bool is_possible = true;
4964 for (unsigned int order_i = partition.node_begin;
4965 order_i < partition.node_end; ++order_i)
4967 unsigned int node_i = m_partitioned_nodes[order_i];
4968 auto &vertex = m_vertices[node_i];
4970 /* Reject the layout if it is individually incompatible
4971 with any node in the partition. */
4972 if (!is_compatible_layout (vertex.node, layout_i))
4974 is_possible = false;
4975 break;
4978 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
4980 auto &other_vertex = m_vertices[other_node_i];
4981 if (other_vertex.partition < vertex.partition)
4983 /* Accumulate the incoming costs from earlier
4984 partitions, plus the cost of any layout changes
4985 on UD itself. */
4986 auto cost = forward_cost (ud, other_node_i, layout_i);
4987 if (!cost.is_possible ())
4988 is_possible = false;
4989 else
4990 layout_costs.in_cost.add_parallel_cost (cost);
4992 else
4993 /* Reject the layout if it would make layout 0 impossible
4994 for later partitions. This amounts to testing that the
4995 target supports reversing the layout change on edges
4996 to later partitions.
4998 In principle, it might be possible to push a layout
4999 change all the way down a graph, so that it never
5000 needs to be reversed and so that the target doesn't
5001 need to support the reverse operation. But it would
5002 be awkward to bail out if we hit a partition that
5003 does not support the new layout, especially since
5004 we are not dealing with a lattice. */
5005 is_possible &= edge_layout_cost (ud, other_node_i, 0,
5006 layout_i).is_possible ();
5008 for_each_partition_edge (node_i, add_cost);
5010 /* Accumulate the cost of using LAYOUT_I within NODE,
5011 both for the inputs and the outputs. */
5012 int factor = internal_node_cost (vertex.node, layout_i,
5013 layout_i);
5014 if (factor < 0)
5016 is_possible = false;
5017 break;
5019 else if (factor)
5020 layout_costs.internal_cost.add_serial_cost
5021 ({ vertex.weight * factor, m_optimize_size });
5023 if (!is_possible)
5025 layout_costs.mark_impossible ();
5026 continue;
5029 /* Combine the incoming and partition-internal costs. */
5030 slpg_layout_cost combined_cost = layout_costs.in_cost;
5031 combined_cost.add_serial_cost (layout_costs.internal_cost);
5033 /* If this partition consists of a single VEC_PERM_EXPR, see
5034 if the VEC_PERM_EXPR can be changed to support output layout
5035 LAYOUT_I while keeping all the provisional choices of input
5036 layout. */
5037 if (single_node
5038 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5040 int factor = internal_node_cost (single_node, -1, layout_i);
5041 if (factor >= 0)
5043 auto weight = m_vertices[single_node->vertex].weight;
5044 slpg_layout_cost internal_cost
5045 = { weight * factor, m_optimize_size };
5047 slpg_layout_cost alt_cost = in_cost;
5048 alt_cost.add_serial_cost (internal_cost);
5049 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5051 combined_cost = alt_cost;
5052 layout_costs.in_cost = in_cost;
5053 layout_costs.internal_cost = internal_cost;
5058 /* Record the layout with the lowest cost. Prefer layout 0 in
5059 the event of a tie between it and another layout. */
5060 if (!min_layout_cost.is_possible ()
5061 || combined_cost.is_better_than (min_layout_cost,
5062 m_optimize_size))
5064 min_layout_i = layout_i;
5065 min_layout_cost = combined_cost;
5069 /* This loop's handling of earlier partitions should ensure that
5070 choosing the original layout for the current partition is no
5071 less valid than it was in the original graph, even with the
5072 provisional layout choices for those earlier partitions. */
5073 gcc_assert (min_layout_cost.is_possible ());
5074 partition.layout = min_layout_i;
5078 /* Make a backward pass through the partitions, accumulating output costs.
5079 Make a final choice of layout for each partition. */
5081 void
5082 vect_optimize_slp_pass::backward_pass ()
5084 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5086 auto &partition = m_partitions[partition_i];
5088 unsigned int min_layout_i = 0;
5089 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5090 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5092 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5093 if (!layout_costs.is_possible ())
5094 continue;
5096 /* Accumulate the costs from successor partitions. */
5097 bool is_possible = true;
5098 for (unsigned int order_i = partition.node_begin;
5099 order_i < partition.node_end; ++order_i)
5101 unsigned int node_i = m_partitioned_nodes[order_i];
5102 auto &vertex = m_vertices[node_i];
5103 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5105 auto &other_vertex = m_vertices[other_node_i];
5106 auto &other_partition = m_partitions[other_vertex.partition];
5107 if (other_vertex.partition > vertex.partition)
5109 /* Accumulate the incoming costs from later
5110 partitions, plus the cost of any layout changes
5111 on UD itself. */
5112 auto cost = backward_cost (ud, other_node_i, layout_i);
5113 if (!cost.is_possible ())
5114 is_possible = false;
5115 else
5116 layout_costs.out_cost.add_parallel_cost (cost);
5118 else
5119 /* Make sure that earlier partitions can (if necessary
5120 or beneficial) keep the layout that they chose in
5121 the forward pass. This ensures that there is at
5122 least one valid choice of layout. */
5123 is_possible &= edge_layout_cost (ud, other_node_i,
5124 other_partition.layout,
5125 layout_i).is_possible ();
5127 for_each_partition_edge (node_i, add_cost);
5129 if (!is_possible)
5131 layout_costs.mark_impossible ();
5132 continue;
5135 /* Locally combine the costs from the forward and backward passes.
5136 (This combined cost is not passed on, since that would lead
5137 to double counting.) */
5138 slpg_layout_cost combined_cost = layout_costs.in_cost;
5139 combined_cost.add_serial_cost (layout_costs.internal_cost);
5140 combined_cost.add_serial_cost (layout_costs.out_cost);
5142 /* Record the layout with the lowest cost. Prefer layout 0 in
5143 the event of a tie between it and another layout. */
5144 if (!min_layout_cost.is_possible ()
5145 || combined_cost.is_better_than (min_layout_cost,
5146 m_optimize_size))
5148 min_layout_i = layout_i;
5149 min_layout_cost = combined_cost;
5153 gcc_assert (min_layout_cost.is_possible ());
5154 partition.layout = min_layout_i;
5158 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5159 NODE already has the layout that was selected for its partition. */
5161 slp_tree
5162 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5163 unsigned int to_layout_i)
5165 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5166 slp_tree result = m_node_layouts[result_i];
5167 if (result)
5168 return result;
5170 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5171 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5173 /* If the vector is uniform or unchanged, there's nothing to do. */
5174 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5175 result = node;
5176 else
5178 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5179 result = vect_create_new_slp_node (scalar_ops);
5180 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5183 else
5185 unsigned int partition_i = m_vertices[node->vertex].partition;
5186 unsigned int from_layout_i = m_partitions[partition_i].layout;
5187 if (from_layout_i == to_layout_i)
5188 return node;
5190 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5191 permutation instead of a serial one. Leave the new permutation
5192 in TMP_PERM on success. */
5193 auto_lane_permutation_t tmp_perm;
5194 unsigned int num_inputs = 1;
5195 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5197 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5198 if (from_layout_i != 0)
5199 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5200 if (to_layout_i != 0)
5201 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5202 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5203 tmp_perm,
5204 SLP_TREE_CHILDREN (node),
5205 false) >= 0)
5206 num_inputs = SLP_TREE_CHILDREN (node).length ();
5207 else
5208 tmp_perm.truncate (0);
5211 if (dump_enabled_p ())
5213 if (tmp_perm.length () > 0)
5214 dump_printf_loc (MSG_NOTE, vect_location,
5215 "duplicating permutation node %p with"
5216 " layout %d\n",
5217 (void *) node, to_layout_i);
5218 else
5219 dump_printf_loc (MSG_NOTE, vect_location,
5220 "inserting permutation node in place of %p\n",
5221 (void *) node);
5224 unsigned int num_lanes = SLP_TREE_LANES (node);
5225 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5226 if (SLP_TREE_SCALAR_STMTS (node).length ())
5228 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5229 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5230 if (from_layout_i != 0)
5231 vect_slp_permute (m_perms[from_layout_i], stmts, false);
5232 if (to_layout_i != 0)
5233 vect_slp_permute (m_perms[to_layout_i], stmts, true);
5235 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5236 SLP_TREE_LANES (result) = num_lanes;
5237 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5238 result->vertex = -1;
5240 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5241 if (tmp_perm.length ())
5243 lane_perm.safe_splice (tmp_perm);
5244 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5246 else
5248 lane_perm.create (num_lanes);
5249 for (unsigned j = 0; j < num_lanes; ++j)
5250 lane_perm.quick_push ({ 0, j });
5251 if (from_layout_i != 0)
5252 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5253 if (to_layout_i != 0)
5254 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5255 SLP_TREE_CHILDREN (result).safe_push (node);
5257 for (slp_tree child : SLP_TREE_CHILDREN (result))
5258 child->refcnt++;
5260 m_node_layouts[result_i] = result;
5261 return result;
5264 /* Apply the chosen vector layouts to the SLP graph. */
5266 void
5267 vect_optimize_slp_pass::materialize ()
5269 /* We no longer need the costs, so avoid having two O(N * P) arrays
5270 live at the same time. */
5271 m_partition_layout_costs.release ();
5272 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
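/* Set of nodes whose lane permutation could fully absorb the chosen
   input layouts; their children need no rewriting in the second walk
   below.  */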
5274 auto_sbitmap fully_folded (m_vertices.length ());
5275 bitmap_clear (fully_folded);
5276 for (unsigned int node_i : m_partitioned_nodes)
5278 auto &vertex = m_vertices[node_i];
5279 slp_tree node = vertex.node;
5280 int layout_i = m_partitions[vertex.partition].layout;
5281 gcc_assert (layout_i >= 0);
5283 /* Rearrange the scalar statements to match the chosen layout. */
5284 if (layout_i > 0)
5285 vect_slp_permute (m_perms[layout_i],
5286 SLP_TREE_SCALAR_STMTS (node), true);
5288 /* Update load and lane permutations. */
5289 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5291 /* First try to absorb the input vector layouts. If that fails,
5292 force the inputs to have layout LAYOUT_I too. We checked that
5293 that was possible before deciding to use nonzero output layouts.
5294 (Note that at this stage we don't really have any guarantee that
5295 the target supports the original VEC_PERM_EXPR.) */
5296 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5297 auto_lane_permutation_t tmp_perm;
5298 tmp_perm.safe_splice (perm);
5299 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5300 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5301 tmp_perm,
5302 SLP_TREE_CHILDREN (node),
5303 false) >= 0)
5305 if (dump_enabled_p ()
5306 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5307 perm.begin ()))
5308 dump_printf_loc (MSG_NOTE, vect_location,
5309 "absorbing input layouts into %p\n",
5310 (void *) node);
5311 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5312 bitmap_set_bit (fully_folded, node_i);
5314 else
5316 /* Not MSG_MISSED because it would make no sense to users. */
5317 if (dump_enabled_p ())
5318 dump_printf_loc (MSG_NOTE, vect_location,
5319 "failed to absorb input layouts into %p\n",
5320 (void *) node);
5321 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5324 else
5326 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5327 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5328 if (layout_i > 0)
5329 /* ??? When we handle non-bijective permutes the idea
5330 is that we can force the load-permutation to be
5331 { min, min + 1, min + 2, ... max }. But then the
5332 scalar defs might no longer match the lane content
5333 which means wrong-code with live lane vectorization.
5334 So we possibly have to have NULL entries for those. */
5335 vect_slp_permute (m_perms[layout_i], load_perm, true);
5339 /* Do this before any nodes disappear, since it involves a walk
5340 over the leaves. */
5341 remove_redundant_permutations ();
5343 /* Replace each child with a correctly laid-out version. */
5344 for (unsigned int node_i : m_partitioned_nodes)
5346 /* Skip nodes that have already been handled above. */
5347 if (bitmap_bit_p (fully_folded, node_i))
5348 continue;
5350 auto &vertex = m_vertices[node_i];
5351 int in_layout_i = m_partitions[vertex.partition].layout;
5352 gcc_assert (in_layout_i >= 0);
5354 unsigned j;
5355 slp_tree child;
5356 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5358 if (!child)
5359 continue;
5361 slp_tree new_child = get_result_with_layout (child, in_layout_i);
5362 if (new_child != child)
5364 vect_free_slp_tree (child);
5365 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5366 new_child->refcnt += 1;
5372 /* Elide load permutations that are not necessary. Such permutations might
5373 be pre-existing, rather than created by the layout optimizations. */
5375 void
5376 vect_optimize_slp_pass::remove_redundant_permutations ()
5378 for (unsigned int node_i : m_leafs)
5380 slp_tree node = m_vertices[node_i].node;
5381 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5382 continue;
5384 /* In basic block vectorization we allow any subchain of an interleaving
5385 chain.
5386 FORNOW: not in loop SLP because of realignment complications. */
5387 if (is_a <bb_vec_info> (m_vinfo))
5389 bool subchain_p = true;
5390 stmt_vec_info next_load_info = NULL;
5391 stmt_vec_info load_info;
5392 unsigned j;
5393 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5395 if (j != 0
5396 && (next_load_info != load_info
5397 || DR_GROUP_GAP (load_info) != 1))
5399 subchain_p = false;
5400 break;
5402 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5404 if (subchain_p)
5406 SLP_TREE_LOAD_PERMUTATION (node).release ();
5407 continue;
5410 else
5412 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5413 stmt_vec_info load_info;
5414 bool this_load_permuted = false;
5415 unsigned j;
5416 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5417 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5419 this_load_permuted = true;
5420 break;
5422 stmt_vec_info first_stmt_info
5423 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5424 if (!this_load_permuted
5425 /* The load requires permutation when unrolling exposes
5426 a gap either because the group is larger than the SLP
5427 group-size or because there is a gap between the groups. */
5428 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5429 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5430 && DR_GROUP_GAP (first_stmt_info) == 0)))
5432 SLP_TREE_LOAD_PERMUTATION (node).release ();
5433 continue;
5439 /* Print the partition graph and layout information to the dump file. */
5441 void
5442 vect_optimize_slp_pass::dump ()
5444 dump_printf_loc (MSG_NOTE, vect_location,
5445 "SLP optimize permutations:\n");
5446 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5448 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5449 const char *sep = "";
5450 for (unsigned int idx : m_perms[layout_i])
5452 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5453 sep = ", ";
5455 dump_printf (MSG_NOTE, " }\n");
5457 dump_printf_loc (MSG_NOTE, vect_location,
5458 "SLP optimize partitions:\n");
5459 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5460 ++partition_i)
5462 auto &partition = m_partitions[partition_i];
5463 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5464 dump_printf_loc (MSG_NOTE, vect_location,
5465 " partition %d (layout %d):\n",
5466 partition_i, partition.layout);
5467 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5468 for (unsigned int order_i = partition.node_begin;
5469 order_i < partition.node_end; ++order_i)
5471 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5472 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5473 (void *) vertex.node);
5474 dump_printf_loc (MSG_NOTE, vect_location,
5475 " weight: %f\n",
5476 vertex.weight.to_double ());
5477 if (vertex.out_degree)
5478 dump_printf_loc (MSG_NOTE, vect_location,
5479 " out weight: %f (degree %d)\n",
5480 vertex.out_weight.to_double (),
5481 vertex.out_degree);
5482 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5483 dump_printf_loc (MSG_NOTE, vect_location,
5484 " op: VEC_PERM_EXPR\n");
5485 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5486 dump_printf_loc (MSG_NOTE, vect_location,
5487 " op template: %G", rep->stmt);
5489 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5490 for (unsigned int order_i = partition.node_begin;
5491 order_i < partition.node_end; ++order_i)
5493 unsigned int node_i = m_partitioned_nodes[order_i];
5494 auto &vertex = m_vertices[node_i];
5495 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5497 auto &other_vertex = m_vertices[other_node_i];
5498 if (other_vertex.partition < vertex.partition)
5499 dump_printf_loc (MSG_NOTE, vect_location,
5500 " - %p [%d] --> %p\n",
5501 (void *) other_vertex.node,
5502 other_vertex.partition,
5503 (void *) vertex.node);
5504 else
5505 dump_printf_loc (MSG_NOTE, vect_location,
5506 " - %p --> [%d] %p\n",
5507 (void *) vertex.node,
5508 other_vertex.partition,
5509 (void *) other_vertex.node);
5511 for_each_partition_edge (node_i, print_edge);
5514 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5516 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5517 if (layout_costs.is_possible ())
5519 dump_printf_loc (MSG_NOTE, vect_location,
5520 " layout %d:%s\n", layout_i,
5521 partition.layout == int (layout_i)
5522 ? " (*)" : "");
5523 slpg_layout_cost combined_cost = layout_costs.in_cost;
5524 combined_cost.add_serial_cost (layout_costs.internal_cost);
5525 combined_cost.add_serial_cost (layout_costs.out_cost);
5526 #define TEMPLATE "{depth: %f, total: %f}"
5527 dump_printf_loc (MSG_NOTE, vect_location,
5528 " " TEMPLATE "\n",
5529 layout_costs.in_cost.depth.to_double (),
5530 layout_costs.in_cost.total.to_double ());
5531 dump_printf_loc (MSG_NOTE, vect_location,
5532 " + " TEMPLATE "\n",
5533 layout_costs.internal_cost.depth.to_double (),
5534 layout_costs.internal_cost.total.to_double ());
5535 dump_printf_loc (MSG_NOTE, vect_location,
5536 " + " TEMPLATE "\n",
5537 layout_costs.out_cost.depth.to_double (),
5538 layout_costs.out_cost.total.to_double ());
5539 dump_printf_loc (MSG_NOTE, vect_location,
5540 " = " TEMPLATE "\n",
5541 combined_cost.depth.to_double (),
5542 combined_cost.total.to_double ());
5543 #undef TEMPLATE
5545 else
5546 dump_printf_loc (MSG_NOTE, vect_location,
5547 " layout %d: rejected\n", layout_i);
5552 /* Main entry point for the SLP graph optimization pass. */
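/* The pass builds the SLP dependence graph, partitions it and, when more
   than one candidate layout exists, chooses a layout per partition via the
   forward and backward passes before materializing the result; otherwise
   it merely removes redundant load permutations.  */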
5554 void
5555 vect_optimize_slp_pass::run ()
5557 build_graph ();
5558 create_partitions ();
5559 start_choosing_layouts ();
5560 if (m_perms.length () > 1)
5562 forward_pass ();
5563 backward_pass ();
5564 if (dump_enabled_p ())
5565 dump ();
5566 materialize ();
5567 while (!m_perms.is_empty ())
5568 m_perms.pop ().release ();
5570 else
5571 remove_redundant_permutations ();
5572 free_graph (m_slpg);
5575 /* Optimize the SLP graph of VINFO. */
5577 void
5578 vect_optimize_slp (vec_info *vinfo)
5580 if (vinfo->slp_instances.is_empty ())
5581 return;
5582 vect_optimize_slp_pass (vinfo).run ();
5585 /* Gather loads reachable from the individual SLP graph entries. */
5587 void
5588 vect_gather_slp_loads (vec_info *vinfo)
5590 unsigned i;
5591 slp_instance instance;
5592 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5594 hash_set<slp_tree> visited;
5595 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5596 SLP_INSTANCE_TREE (instance), visited);
5601 /* For each possible SLP instance decide whether to SLP it and calculate overall
5602 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
5603 least one instance. */
5605 bool
5606 vect_make_slp_decision (loop_vec_info loop_vinfo)
5608 unsigned int i;
5609 poly_uint64 unrolling_factor = 1;
5610 const vec<slp_instance> &slp_instances
5611 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5612 slp_instance instance;
5613 int decided_to_slp = 0;
5615 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5617 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5619 /* FORNOW: SLP if you can. */
5620 /* All unroll factors have the form:
5622 GET_MODE_SIZE (vinfo->vector_mode) * X
5624 for some rational X, so they must have a common multiple. */
5625 unrolling_factor
5626 = force_common_multiple (unrolling_factor,
5627 SLP_INSTANCE_UNROLLING_FACTOR (instance));
5629 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5630 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5631 loop-based vectorization. Such stmts will be marked as HYBRID. */
5632 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5633 decided_to_slp++;
5636 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5638 if (decided_to_slp && dump_enabled_p ())
5640 dump_printf_loc (MSG_NOTE, vect_location,
5641 "Decided to SLP %d instances. Unrolling factor ",
5642 decided_to_slp);
5643 dump_dec (MSG_NOTE, unrolling_factor);
5644 dump_printf (MSG_NOTE, "\n");
5647 return (decided_to_slp > 0);
5650 /* Private data for vect_detect_hybrid_slp. */
5651 struct vdhs_data
5653 loop_vec_info loop_vinfo;
5654 vec<stmt_vec_info> *worklist;
5657 /* Walker for walk_gimple_op. */
5659 static tree
5660 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5662 walk_stmt_info *wi = (walk_stmt_info *)data;
5663 vdhs_data *dat = (vdhs_data *)wi->info;
5665 if (wi->is_lhs)
5666 return NULL_TREE;
5668 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5669 if (!def_stmt_info)
5670 return NULL_TREE;
5671 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5672 if (PURE_SLP_STMT (def_stmt_info))
5674 if (dump_enabled_p ())
5675 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5676 def_stmt_info->stmt);
5677 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5678 dat->worklist->safe_push (def_stmt_info);
5681 return NULL_TREE;
5684 /* Check whether STMT_INFO is (indirectly) consumed only by SLP and mark it
5685 pure_slp if so; otherwise push it to WORKLIST. */
5687 static void
5688 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5689 vec<stmt_vec_info> &worklist,
5690 stmt_vec_info stmt_info)
5692 if (dump_enabled_p ())
5693 dump_printf_loc (MSG_NOTE, vect_location,
5694 "Processing hybrid candidate : %G", stmt_info->stmt);
5695 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5696 imm_use_iterator iter2;
5697 ssa_op_iter iter1;
5698 use_operand_p use_p;
5699 def_operand_p def_p;
5700 bool any_def = false;
5701 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5703 any_def = true;
5704 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5706 if (is_gimple_debug (USE_STMT (use_p)))
5707 continue;
5708 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5710 /* An out-of-loop use means this is a loop_vect sink. */
5710 if (!use_info)
5712 if (dump_enabled_p ())
5713 dump_printf_loc (MSG_NOTE, vect_location,
5714 "Found loop_vect sink: %G", stmt_info->stmt);
5715 worklist.safe_push (stmt_info);
5716 return;
5718 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5720 if (dump_enabled_p ())
5721 dump_printf_loc (MSG_NOTE, vect_location,
5722 "Found loop_vect use: %G", use_info->stmt);
5723 worklist.safe_push (stmt_info);
5724 return;
5728 /* No def means this is a loop_vect sink. */
5729 if (!any_def)
5731 if (dump_enabled_p ())
5732 dump_printf_loc (MSG_NOTE, vect_location,
5733 "Found loop_vect sink: %G", stmt_info->stmt);
5734 worklist.safe_push (stmt_info);
5735 return;
5737 if (dump_enabled_p ())
5738 dump_printf_loc (MSG_NOTE, vect_location,
5739 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5740 STMT_SLP_TYPE (stmt_info) = pure_slp;
5743 /* Find stmts that must be both vectorized and SLPed. */
5745 void
5746 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5748 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5750 /* All stmts participating in SLP are marked pure_slp, all other
5751 stmts are loop_vect.
5752 First collect all loop_vect stmts into a worklist.
5753 With SLP patterns not all original scalar stmts appear in
5754 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5755 Rectify this here and do a backward walk over the IL only considering
5756 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
5757 mark them as pure_slp. */
5758 auto_vec<stmt_vec_info> worklist;
5759 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5761 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5762 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5763 gsi_next (&gsi))
5765 gphi *phi = gsi.phi ();
5766 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5767 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5768 maybe_push_to_hybrid_worklist (loop_vinfo,
5769 worklist, stmt_info);
5771 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5772 gsi_prev (&gsi))
5774 gimple *stmt = gsi_stmt (gsi);
5775 if (is_gimple_debug (stmt))
5776 continue;
5777 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
5778 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5780 for (gimple_stmt_iterator gsi2
5781 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5782 !gsi_end_p (gsi2); gsi_next (&gsi2))
5784 stmt_vec_info patt_info
5785 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
5786 if (!STMT_SLP_TYPE (patt_info)
5787 && STMT_VINFO_RELEVANT (patt_info))
5788 maybe_push_to_hybrid_worklist (loop_vinfo,
5789 worklist, patt_info);
5791 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5793 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5794 maybe_push_to_hybrid_worklist (loop_vinfo,
5795 worklist, stmt_info);
5799 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
5800 mark any SLP vectorized stmt as hybrid.
5801 ??? We're visiting def stmts N times (once for each non-SLP and
5802 once for each hybrid-SLP use). */
5803 walk_stmt_info wi;
5804 vdhs_data dat;
5805 dat.worklist = &worklist;
5806 dat.loop_vinfo = loop_vinfo;
5807 memset (&wi, 0, sizeof (wi));
5808 wi.info = (void *)&dat;
5809 while (!worklist.is_empty ())
5811 stmt_vec_info stmt_info = worklist.pop ();
5812 /* Since SSA operands are not set up for pattern stmts we need
5813 to use walk_gimple_op. */
5814 wi.is_lhs = 0;
5815 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
5816 /* For gather/scatter make sure to walk the offset operand, that
5817 can be a scaling and conversion away. */
5818 gather_scatter_info gs_info;
5819 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5820 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
5822 int dummy;
5823 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
5829 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
5831 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
5832 : vec_info (vec_info::bb, shared),
5833 bbs (_bbs),
5834 roots (vNULL)
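/* Assign region statements a uid of zero and register the non-debug ones
   with the vec_info; PHIs in the first (entry) block of the region are
   skipped entirely.  */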
5836 for (unsigned i = 0; i < bbs.length (); ++i)
5838 if (i != 0)
5839 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5840 gsi_next (&si))
5842 gphi *phi = si.phi ();
5843 gimple_set_uid (phi, 0);
5844 add_stmt (phi);
5846 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5847 !gsi_end_p (gsi); gsi_next (&gsi))
5849 gimple *stmt = gsi_stmt (gsi);
5850 gimple_set_uid (stmt, 0);
5851 if (is_gimple_debug (stmt))
5852 continue;
5853 add_stmt (stmt);
5859 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
5860 stmts in the basic block. */
5862 _bb_vec_info::~_bb_vec_info ()
5864 /* Reset region marker. */
5865 for (unsigned i = 0; i < bbs.length (); ++i)
5867 if (i != 0)
5868 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5869 gsi_next (&si))
5871 gphi *phi = si.phi ();
5872 gimple_set_uid (phi, -1);
5874 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5875 !gsi_end_p (gsi); gsi_next (&gsi))
5877 gimple *stmt = gsi_stmt (gsi);
5878 gimple_set_uid (stmt, -1);
5882 for (unsigned i = 0; i < roots.length (); ++i)
5884 roots[i].stmts.release ();
5885 roots[i].roots.release ();
5887 roots.release ();
5890 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
5891 given that child nodes have already been processed, and that
5892 their def types currently match their SLP node's def type. */
5894 static bool
5895 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
5896 slp_instance node_instance,
5897 stmt_vector_for_cost *cost_vec)
5899 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5901 /* Calculate the number of vector statements to be created for the
5902 scalar stmts in this node. For SLP reductions it is equal to the
5903 number of vector statements in the children (which has already been
5904 calculated by the recursive call). Otherwise it is the number of
5905 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
5906 VF divided by the number of elements in a vector. */
5907 if (!STMT_VINFO_DATA_REF (stmt_info)
5908 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5910 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
5911 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
5913 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
5914 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
5915 break;
5918 else
5920 poly_uint64 vf;
5921 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
5922 vf = loop_vinfo->vectorization_factor;
5923 else
5924 vf = 1;
5925 unsigned int group_size = SLP_TREE_LANES (node);
5926 tree vectype = SLP_TREE_VECTYPE (node);
5927 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
5928 = vect_get_num_vectors (vf * group_size, vectype);
5931 /* Handle purely internal nodes. */
5932 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5933 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
5935 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
5937 bool dummy;
5938 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
5939 node, node_instance, cost_vec);
5942 /* Try to build NODE from scalars, returning true on success.
5943 NODE_INSTANCE is the SLP instance that contains NODE. */
5945 static bool
5946 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
5947 slp_instance node_instance)
5949 stmt_vec_info stmt_info;
5950 unsigned int i;
5952 if (!is_a <bb_vec_info> (vinfo)
5953 || node == SLP_INSTANCE_TREE (node_instance)
5954 || !SLP_TREE_SCALAR_STMTS (node).exists ()
5955 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
5956 /* Force the mask use to be built from scalars instead. */
5957 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
5958 return false;
5960 if (dump_enabled_p ())
5961 dump_printf_loc (MSG_NOTE, vect_location,
5962 "Building vector operands of %p from scalars instead\n",
5963 (void *) node);
5965 /* Don't remove and free the child nodes here, since they could be
5966 referenced by other structures. The analysis and scheduling phases
5967 (need to) ignore child nodes of anything that isn't vect_internal_def. */
5968 unsigned int group_size = SLP_TREE_LANES (node);
5969 SLP_TREE_DEF_TYPE (node) = vect_external_def;
5970 /* Invariants get their vector type from the uses. */
5971 SLP_TREE_VECTYPE (node) = NULL_TREE;
5972 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
5973 SLP_TREE_LOAD_PERMUTATION (node).release ();
5974 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5976 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
5977 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
5979 return true;
5982 /* Return true if all elements of the slice are the same. */
5983 bool
5984 vect_scalar_ops_slice::all_same_p () const
5986 for (unsigned int i = 1; i < length; ++i)
5987 if (!operand_equal_p (op (0), op (i)))
5988 return false;
5989 return true;
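/* Hash a slice of scalar operands by combining the hash of each element.  */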
5992 hashval_t
5993 vect_scalar_ops_slice_hash::hash (const value_type &s)
5995 hashval_t hash = 0;
5996 for (unsigned i = 0; i < s.length; ++i)
5997 hash = iterative_hash_expr (s.op (i), hash);
5998 return hash;
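/* Return true if slices S1 and S2 have the same length and pairwise equal
   operands.  */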
6001 bool
6002 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6003 const compare_type &s2)
6005 if (s1.length != s2.length)
6006 return false;
6007 for (unsigned i = 0; i < s1.length; ++i)
6008 if (!operand_equal_p (s1.op (i), s2.op (i)))
6009 return false;
6010 return true;
6013 /* Compute the prologue cost for invariant or constant operands represented
6014 by NODE. */
6016 static void
6017 vect_prologue_cost_for_slp (slp_tree node,
6018 stmt_vector_for_cost *cost_vec)
6020 /* There's a special case of an existing vector, that costs nothing. */
6021 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6022 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6023 return;
6024 /* Without looking at the actual initializer a vector of
6025 constants can be implemented as load from the constant pool.
6026 When all elements are the same we can use a splat. */
6027 tree vectype = SLP_TREE_VECTYPE (node);
6028 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6029 unsigned HOST_WIDE_INT const_nunits;
6030 unsigned nelt_limit;
6031 auto ops = &SLP_TREE_SCALAR_OPS (node);
6032 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6033 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6034 && ! multiple_p (const_nunits, group_size))
6036 nelt_limit = const_nunits;
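/* Hash each vector-sized slice of the scalar operands so that identical
   constant vectors within the node are costed only once.  */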
6037 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6038 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6039 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6040 starts.quick_push (i * const_nunits);
6042 else
6044 /* If either the vector has variable length or the vectors
6045 are composed of repeated whole groups we only need to
6046 cost construction once. All vectors will be the same. */
6047 nelt_limit = group_size;
6048 starts.quick_push (0);
6050 /* ??? We're just tracking whether vectors in a single node are the same.
6051 Ideally we'd do something more global. */
6052 for (unsigned int start : starts)
6054 vect_cost_for_stmt kind;
6055 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6056 kind = vector_load;
6057 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6058 kind = scalar_to_vec;
6059 else
6060 kind = vec_construct;
6061 record_stmt_cost (cost_vec, 1, kind, node, vectype, 0, vect_prologue);
6065 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6066 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
6068 Return true if the operations are supported. */
6070 static bool
6071 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6072 slp_instance node_instance,
6073 hash_set<slp_tree> &visited_set,
6074 vec<slp_tree> &visited_vec,
6075 stmt_vector_for_cost *cost_vec)
6077 int i, j;
6078 slp_tree child;
6080 /* Assume we can code-generate all invariants. */
6081 if (!node
6082 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6083 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6084 return true;
6086 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6088 if (dump_enabled_p ())
6089 dump_printf_loc (MSG_NOTE, vect_location,
6090 "Failed cyclic SLP reference in %p\n", (void *) node);
6091 return false;
6093 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6095 /* If we already analyzed the exact same set of scalar stmts we're done.
6096 We share the generated vector stmts for those. */
6097 if (visited_set.add (node))
6098 return true;
6099 visited_vec.safe_push (node);
6101 bool res = true;
6102 unsigned visited_rec_start = visited_vec.length ();
6103 unsigned cost_vec_rec_start = cost_vec->length ();
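/* VISITED_REC_START and COST_VEC_REC_START mark rollback points in case
   analysis of this node or its children fails below.  */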
6104 bool seen_non_constant_child = false;
6105 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6107 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6108 visited_set, visited_vec,
6109 cost_vec);
6110 if (!res)
6111 break;
6112 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6113 seen_non_constant_child = true;
6115 /* We're having difficulties scheduling nodes with just constant
6116 operands and no scalar stmts since we then cannot compute a stmt
6117 insertion place. */
6118 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6120 if (dump_enabled_p ())
6121 dump_printf_loc (MSG_NOTE, vect_location,
6122 "Cannot vectorize all-constant op node %p\n",
6123 (void *) node);
6124 res = false;
6127 if (res)
6128 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6129 cost_vec);
6130 /* If analysis failed we have to pop all recursive visited nodes
6131 plus ourselves. */
6132 if (!res)
6134 while (visited_vec.length () >= visited_rec_start)
6135 visited_set.remove (visited_vec.pop ());
6136 cost_vec->truncate (cost_vec_rec_start);
6139 /* When the node can be vectorized, cost the invariant nodes it references.
6140 This is not done in DFS order so that the referring node's
6141 vectorizable_* calls can nail down the invariant node's vector type
6142 and possibly unshare it if it needs a different vector type than
6143 other referrers. */
6144 if (res)
6145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6146 if (child
6147 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6148 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6149 /* Perform usual caching, note code-generation still
6150 code-gens these nodes multiple times but we expect
6151 to CSE them later. */
6152 && !visited_set.add (child))
6154 visited_vec.safe_push (child);
6155 /* ??? After auditing more code paths make a "default"
6156 and push the vector type from NODE to all children
6157 if it is not already set. */
6158 /* Compute the number of vectors to be generated. */
6159 tree vector_type = SLP_TREE_VECTYPE (child);
6160 if (!vector_type)
6162 /* For shifts with a scalar argument we don't need
6163 to cost or code-generate anything.
6164 ??? Represent this more explicitly. */
6165 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6166 == shift_vec_info_type)
6167 && j == 1);
6168 continue;
6170 unsigned group_size = SLP_TREE_LANES (child);
6171 poly_uint64 vf = 1;
6172 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6173 vf = loop_vinfo->vectorization_factor;
6174 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6175 = vect_get_num_vectors (vf * group_size, vector_type);
6176 /* And cost them. */
6177 vect_prologue_cost_for_slp (child, cost_vec);
6180 /* If this node or any of its children can't be vectorized, try pruning
6181 the tree here rather than felling the whole thing. */
6182 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6184 /* We'll need to revisit this for invariant costing and number
6185 of vectorized stmt setting. */
6186 res = true;
6189 return res;
6192 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6193 region and that can be vectorized using vectorizable_live_operation
6194 with STMT_VINFO_LIVE_P. Live operations that cannot be handled will cause
6195 the scalar code computing them to be retained. */
6197 static void
6198 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6199 slp_instance instance,
6200 stmt_vector_for_cost *cost_vec,
6201 hash_set<stmt_vec_info> &svisited,
6202 hash_set<slp_tree> &visited)
6204 if (visited.add (node))
6205 return;
6207 unsigned i;
6208 stmt_vec_info stmt_info;
6209 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6210 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6212 if (svisited.contains (stmt_info))
6213 continue;
6214 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6215 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6216 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6217 /* Only the pattern root stmt computes the original scalar value. */
6218 continue;
6219 bool mark_visited = true;
6220 gimple *orig_stmt = orig_stmt_info->stmt;
6221 ssa_op_iter op_iter;
6222 def_operand_p def_p;
6223 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6225 imm_use_iterator use_iter;
6226 gimple *use_stmt;
6227 stmt_vec_info use_stmt_info;
6228 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6229 if (!is_gimple_debug (use_stmt))
6231 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6232 if (!use_stmt_info
6233 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6235 STMT_VINFO_LIVE_P (stmt_info) = true;
6236 if (vectorizable_live_operation (bb_vinfo, stmt_info,
6237 NULL, node, instance, i,
6238 false, cost_vec))
6239 /* ??? So we know we can vectorize the live stmt
6240 from one SLP node. If we cannot do so from all
6241 or none consistently we'd have to record which
6242 SLP node (and lane) we want to use for the live
6243 operation. So make sure we can code-generate
6244 from all nodes. */
6245 mark_visited = false;
6246 else
6247 STMT_VINFO_LIVE_P (stmt_info) = false;
6248 break;
6251 /* We have to verify whether we can insert the lane extract
6252 before all uses. The following is a conservative approximation.
6253 We cannot put this into vectorizable_live_operation because
6254 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6255 doesn't work.
6256 Note that while emitting code for loads at the first load should
6257 make this a non-problem, leaves we construct from scalars are
6258 vectorized after the last scalar def.
6259 ??? If we'd actually compute the insert location during
6260 analysis we could use sth less conservative than the last
6261 scalar stmt in the node for the dominance check. */
6262 /* ??? What remains is "live" uses in vector CTORs in the same
6263 SLP graph which is where those uses can end up code-generated
6264 right after their definition instead of close to their original
6265 use. But that would restrict us to code-generate lane-extracts
6266 from the latest stmt in a node. So we compensate for this
6267 during code-generation, simply not replacing uses for those
6268 hopefully rare cases. */
6269 if (STMT_VINFO_LIVE_P (stmt_info))
6270 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6271 if (!is_gimple_debug (use_stmt)
6272 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6273 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6274 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6276 if (dump_enabled_p ())
6277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6278 "Cannot determine insertion place for "
6279 "lane extract\n");
6280 STMT_VINFO_LIVE_P (stmt_info) = false;
6281 mark_visited = true;
6284 if (mark_visited)
6285 svisited.add (stmt_info);
6288 slp_tree child;
6289 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6290 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6291 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
6292 cost_vec, svisited, visited);
6295 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6297 static bool
6298 vectorizable_bb_reduc_epilogue (slp_instance instance,
6299 stmt_vector_for_cost *cost_vec)
6301 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6302 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6303 if (reduc_code == MINUS_EXPR)
6304 reduc_code = PLUS_EXPR;
6305 internal_fn reduc_fn;
6306 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6307 if (!vectype
6308 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6309 || reduc_fn == IFN_LAST
6310 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6311 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6312 TREE_TYPE (vectype)))
6313 return false;
6315 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6316 cost log2 vector operations plus shuffles and one extraction. */
6317 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6318 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6319 vectype, 0, vect_body);
6320 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6321 vectype, 0, vect_body);
6322 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6323 vectype, 0, vect_body);
6324 return true;
6327 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6328 and recurse to children. */
6330 static void
6331 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6332 hash_set<slp_tree> &visited)
6334 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6335 || visited.add (node))
6336 return;
6338 stmt_vec_info stmt;
6339 unsigned i;
6340 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6341 roots.remove (vect_orig_stmt (stmt));
6343 slp_tree child;
6344 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6345 if (child)
6346 vect_slp_prune_covered_roots (child, roots, visited);
6349 /* Analyze statements in SLP instances of VINFO. Return true if the
6350 operations are supported. */
6352 bool
6353 vect_slp_analyze_operations (vec_info *vinfo)
6355 slp_instance instance;
6356 int i;
6358 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6360 hash_set<slp_tree> visited;
6361 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6363 auto_vec<slp_tree> visited_vec;
6364 stmt_vector_for_cost cost_vec;
6365 cost_vec.create (2);
6366 if (is_a <bb_vec_info> (vinfo))
6367 vect_location = instance->location ();
6368 if (!vect_slp_analyze_node_operations (vinfo,
6369 SLP_INSTANCE_TREE (instance),
6370 instance, visited, visited_vec,
6371 &cost_vec)
6372 /* CTOR instances require vectorized defs for the SLP tree root. */
6373 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6374 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6375 != vect_internal_def
6376 /* Make sure we vectorized with the expected type. */
6377 || !useless_type_conversion_p
6378 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6379 (instance->root_stmts[0]->stmt))),
6380 TREE_TYPE (SLP_TREE_VECTYPE
6381 (SLP_INSTANCE_TREE (instance))))))
6382 /* Check we can vectorize the reduction. */
6383 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6384 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6386 slp_tree node = SLP_INSTANCE_TREE (instance);
6387 stmt_vec_info stmt_info;
6388 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6389 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6390 else
6391 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6392 if (dump_enabled_p ())
6393 dump_printf_loc (MSG_NOTE, vect_location,
6394 "removing SLP instance operations starting from: %G",
6395 stmt_info->stmt);
6396 vect_free_slp_instance (instance);
6397 vinfo->slp_instances.ordered_remove (i);
6398 cost_vec.release ();
6399 while (!visited_vec.is_empty ())
6400 visited.remove (visited_vec.pop ());
6402 else
6404 i++;
6405 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6407 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6408 cost_vec.release ();
6410 else
6411 /* For BB vectorization remember the SLP graph entry
6412 cost for later. */
6413 instance->cost_vec = cost_vec;
6417 /* Now look for SLP instances with a root that are covered by other
6418 instances and remove them. */
6419 hash_set<stmt_vec_info> roots;
6420 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6421 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6422 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6423 if (!roots.is_empty ())
6425 visited.empty ();
6426 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6427 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6428 visited);
6429 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6430 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6431 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6433 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6434 if (dump_enabled_p ())
6435 dump_printf_loc (MSG_NOTE, vect_location,
6436 "removing SLP instance operations starting "
6437 "from: %G", root->stmt);
6438 vect_free_slp_instance (instance);
6439 vinfo->slp_instances.ordered_remove (i);
6441 else
6442 ++i;
6445 /* Compute vectorizable live stmts. */
6446 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6448 hash_set<stmt_vec_info> svisited;
6449 hash_set<slp_tree> visited;
6450 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6452 vect_location = instance->location ();
6453 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6454 instance, &instance->cost_vec, svisited,
6455 visited);
6459 return !vinfo->slp_instances.is_empty ();
6462 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
6463 transitively collapsing any chain of leaders along the way. */
6465 static slp_instance
6466 get_ultimate_leader (slp_instance instance,
6467 hash_map<slp_instance, slp_instance> &instance_leader)
6469 auto_vec<slp_instance *, 8> chain;
6470 slp_instance *tem;
6471 while (*(tem = instance_leader.get (instance)) != instance)
6473 chain.safe_push (tem);
6474 instance = *tem;
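/* Path-compress: make every visited entry point directly at the ultimate
   leader, so a chain A -> B -> C becomes A -> C and B -> C.  */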
6476 while (!chain.is_empty ())
6477 *chain.pop () = instance;
6478 return instance;
6481 namespace {
6482 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6483 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6484 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6486 INSTANCE_LEADER is as for get_ultimate_leader. */
6488 template<typename T>
6489 bool
6490 vect_map_to_instance (slp_instance instance, T key,
6491 hash_map<T, slp_instance> &key_to_instance,
6492 hash_map<slp_instance, slp_instance> &instance_leader)
6494 bool existed_p;
6495 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6496 if (!existed_p)
6498 else if (key_instance != instance)
6500 /* If we're running into a previously marked key make us the
6501 leader of the current ultimate leader. This keeps the
6502 leader chain acyclic and works even when the current instance
6503 connects two previously independent graph parts. */
6504 slp_instance key_leader
6505 = get_ultimate_leader (key_instance, instance_leader);
6506 if (key_leader != instance)
6507 instance_leader.put (key_leader, instance);
6509 key_instance = instance;
6510 return existed_p;
6514 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6516 static void
6517 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6518 slp_instance instance, slp_tree node,
6519 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6520 hash_map<slp_tree, slp_instance> &node_to_instance,
6521 hash_map<slp_instance, slp_instance> &instance_leader)
6523 stmt_vec_info stmt_info;
6524 unsigned i;
6526 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6527 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6528 instance_leader);
6530 if (vect_map_to_instance (instance, node, node_to_instance,
6531 instance_leader))
6532 return;
6534 slp_tree child;
6535 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6536 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6537 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6538 node_to_instance, instance_leader);
6541 /* Partition the SLP graph into pieces that can be costed independently. */
6543 static void
6544 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6546 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6548 /* First walk the SLP graph assigning each involved scalar stmt a
6549 corresponding SLP graph entry and upon visiting a previously
6550 marked stmt, make the stmt's leader the current SLP graph entry. */
6551 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6552 hash_map<slp_tree, slp_instance> node_to_instance;
6553 hash_map<slp_instance, slp_instance> instance_leader;
6554 slp_instance instance;
6555 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6557 instance_leader.put (instance, instance);
6558 vect_bb_partition_graph_r (bb_vinfo,
6559 instance, SLP_INSTANCE_TREE (instance),
6560 stmt_to_instance, node_to_instance,
6561 instance_leader);
6564 /* Then collect entries to each independent subgraph. */
6565 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6567 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6568 leader->subgraph_entries.safe_push (instance);
6569 if (dump_enabled_p ()
6570 && leader != instance)
6571 dump_printf_loc (MSG_NOTE, vect_location,
6572 "instance %p is leader of %p\n",
6573 (void *) leader, (void *) instance);
6577 /* Compute the set of scalar stmts participating in internal and external
6578 nodes. */
6580 static void
6581 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6582 hash_set<slp_tree> &visited,
6583 hash_set<stmt_vec_info> &vstmts,
6584 hash_set<stmt_vec_info> &estmts)
6586 int i;
6587 stmt_vec_info stmt_info;
6588 slp_tree child;
6590 if (visited.add (node))
6591 return;
6593 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6595 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6596 vstmts.add (stmt_info);
6598 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6599 if (child)
6600 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6601 vstmts, estmts);
6603 else
6604 for (tree def : SLP_TREE_SCALAR_OPS (node))
6606 stmt_vec_info def_stmt = vinfo->lookup_def (def);
6607 if (def_stmt)
6608 estmts.add (def_stmt);
6613 /* Compute the scalar cost of the SLP node NODE and its children
6614 and return it. Do not account defs that are marked in LIFE and
6615 update LIFE according to uses of NODE. */
6617 static void
6618 vect_bb_slp_scalar_cost (vec_info *vinfo,
6619 slp_tree node, vec<bool, va_heap> *life,
6620 stmt_vector_for_cost *cost_vec,
6621 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6622 hash_set<slp_tree> &visited)
6624 unsigned i;
6625 stmt_vec_info stmt_info;
6626 slp_tree child;
6628 if (visited.add (node))
6629 return;
6631 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6633 ssa_op_iter op_iter;
6634 def_operand_p def_p;
6636 if ((*life)[i])
6637 continue;
6639 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6640 gimple *orig_stmt = orig_stmt_info->stmt;
6642 /* If there is a non-vectorized use of the defs then the scalar
6643 stmt is kept live in which case we do not account it or any
6644 required defs in the SLP children in the scalar cost. This
6645 way we make the vectorization more costly when compared to
6646 the scalar cost. */
6647 if (!STMT_VINFO_LIVE_P (stmt_info))
6649 auto_vec<gimple *, 8> worklist;
6650 hash_set<gimple *> *worklist_visited = NULL;
6651 worklist.quick_push (orig_stmt);
6654 gimple *work_stmt = worklist.pop ();
6655 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
6657 imm_use_iterator use_iter;
6658 gimple *use_stmt;
6659 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
6660 DEF_FROM_PTR (def_p))
6661 if (!is_gimple_debug (use_stmt))
6663 stmt_vec_info use_stmt_info
6664 = vinfo->lookup_stmt (use_stmt);
6665 if (!use_stmt_info
6666 || !vectorized_scalar_stmts.contains (use_stmt_info))
6668 if (use_stmt_info
6669 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
6671 /* For stmts participating in patterns we have
6672 to check its uses recursively. */
6673 if (!worklist_visited)
6674 worklist_visited = new hash_set<gimple *> ();
6675 if (!worklist_visited->add (use_stmt))
6676 worklist.safe_push (use_stmt);
6677 continue;
6679 (*life)[i] = true;
6680 goto next_lane;
6685 while (!worklist.is_empty ());
6686 next_lane:
6687 if (worklist_visited)
6688 delete worklist_visited;
6689 if ((*life)[i])
6690 continue;
6693 /* Count scalar stmts only once. */
6694 if (gimple_visited_p (orig_stmt))
6695 continue;
6696 gimple_set_visited (orig_stmt, true);
6698 vect_cost_for_stmt kind;
6699 if (STMT_VINFO_DATA_REF (orig_stmt_info))
6701 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
6702 kind = scalar_load;
6703 else
6704 kind = scalar_store;
6706 else if (vect_nop_conversion_p (orig_stmt_info))
6707 continue;
6708 /* For single-argument PHIs assume coalescing which means zero cost
6709 for the scalar and the vector PHIs. This avoids artificially
6710 favoring the vector path (but may pessimize it in some cases). */
6711 else if (is_a <gphi *> (orig_stmt_info->stmt)
6712 && gimple_phi_num_args
6713 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
6714 continue;
6715 else
6716 kind = scalar_stmt;
6717 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
6718 SLP_TREE_VECTYPE (node), 0, vect_body);
6721 auto_vec<bool, 20> subtree_life;
6722 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6724 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6726 /* Do not directly pass LIFE to the recursive call, copy it to
6727 confine changes in the callee to the current child/subtree. */
6728 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6730 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
6731 for (unsigned j = 0;
6732 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
6734 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
6735 if (perm.first == i)
6736 subtree_life[perm.second] = (*life)[j];
6739 else
6741 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
6742 subtree_life.safe_splice (*life);
6744 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
6745 vectorized_scalar_stmts, visited);
6746 subtree_life.truncate (0);
6751 /* Comparator for the loop-index sorted cost vectors. */
6753 static int
6754 li_cost_vec_cmp (const void *a_, const void *b_)
6756 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
6757 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
6758 if (a->first < b->first)
6759 return -1;
6760 else if (a->first == b->first)
6761 return 0;
6762 return 1;
6765 /* Check if vectorization of the basic block is profitable for the
6766 subgraph denoted by SLP_INSTANCES. */
6768 static bool
6769 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
6770 vec<slp_instance> slp_instances,
6771 loop_p orig_loop)
6773 slp_instance instance;
6774 int i;
6775 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
6776 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
6778 if (dump_enabled_p ())
6780 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
6781 hash_set<slp_tree> visited;
6782 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6783 vect_print_slp_graph (MSG_NOTE, vect_location,
6784 SLP_INSTANCE_TREE (instance), visited);
6787 /* Compute the set of scalar stmts we know will go away 'locally' when
6788 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
6789 not accurate for nodes promoted extern late or for scalar stmts that
6790 are used both in extern defs and in vectorized defs. */
6791 hash_set<stmt_vec_info> vectorized_scalar_stmts;
6792 hash_set<stmt_vec_info> scalar_stmts_in_externs;
6793 hash_set<slp_tree> visited;
6794 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6796 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
6797 SLP_INSTANCE_TREE (instance),
6798 visited,
6799 vectorized_scalar_stmts,
6800 scalar_stmts_in_externs);
6801 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
6802 vectorized_scalar_stmts.add (rstmt);
6804 /* Scalar stmts used as defs in external nodes need to be preserved, so
6805 remove them from vectorized_scalar_stmts. */
6806 for (stmt_vec_info stmt : scalar_stmts_in_externs)
6807 vectorized_scalar_stmts.remove (stmt);
6809 /* Calculate scalar cost and sum the cost for the vector stmts
6810 previously collected. */
6811 stmt_vector_for_cost scalar_costs = vNULL;
6812 stmt_vector_for_cost vector_costs = vNULL;
6813 visited.empty ();
6814 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6816 auto_vec<bool, 20> life;
6817 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
6818 true);
6819 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6820 record_stmt_cost (&scalar_costs,
6821 SLP_INSTANCE_ROOT_STMTS (instance).length (),
6822 scalar_stmt,
6823 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
6824 vect_bb_slp_scalar_cost (bb_vinfo,
6825 SLP_INSTANCE_TREE (instance),
6826 &life, &scalar_costs, vectorized_scalar_stmts,
6827 visited);
6828 vector_costs.safe_splice (instance->cost_vec);
6829 instance->cost_vec.release ();
6832 if (dump_enabled_p ())
6833 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
6835 /* When costing non-loop vectorization we need to consider each covered
6836 loop independently and make sure vectorization is profitable. For
6837 now we assume a loop may not be entered or may be executed an arbitrary
6838 number of iterations (??? static information can provide more
6839 precise info here), which means we can simply cost each containing
6840 loop's stmts separately. */
6842 /* First produce cost vectors sorted by loop index. */
6843 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6844 li_scalar_costs (scalar_costs.length ());
6845 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6846 li_vector_costs (vector_costs.length ());
6847 stmt_info_for_cost *cost;
6848 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
6850 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6851 li_scalar_costs.quick_push (std::make_pair (l, cost));
6853 /* Use an arbitrary loop from the scalar costs as fallback in case the first
6854 vector_costs entry does not have a stmt_info associated with it. */
6855 unsigned l = li_scalar_costs[0].first;
6856 FOR_EACH_VEC_ELT (vector_costs, i, cost)
6858 /* We inherit the loop from the previous COST; invariants, externals and
6859 extracts immediately follow the cost for the related stmt. */
6860 if (cost->stmt_info)
6861 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6862 li_vector_costs.quick_push (std::make_pair (l, cost));
6864 li_scalar_costs.qsort (li_cost_vec_cmp);
6865 li_vector_costs.qsort (li_cost_vec_cmp);
6867 /* Now cost the portions individually. */
6868 unsigned vi = 0;
6869 unsigned si = 0;
6870 bool profitable = true;
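/* Walk the two loop-index sorted cost vectors in parallel, finishing and
   comparing the scalar and vector cost of each loop's part in turn.  */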
6871 while (si < li_scalar_costs.length ()
6872 && vi < li_vector_costs.length ())
6874 unsigned sl = li_scalar_costs[si].first;
6875 unsigned vl = li_vector_costs[vi].first;
6876 if (sl != vl)
6878 if (dump_enabled_p ())
6879 dump_printf_loc (MSG_NOTE, vect_location,
6880 "Scalar %d and vector %d loop part do not "
6881 "match up, skipping scalar part\n", sl, vl);
6882 /* Skip the scalar part, assuming zero cost on the vector side. */
6885 si++;
6887 while (si < li_scalar_costs.length ()
6888 && li_scalar_costs[si].first == sl);
6889 continue;
6892 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
6895 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
6896 si++;
6898 while (si < li_scalar_costs.length ()
6899 && li_scalar_costs[si].first == sl);
6900 unsigned dummy;
6901 finish_cost (scalar_target_cost_data, nullptr,
6902 &dummy, &scalar_cost, &dummy);
6904 /* Complete the target-specific vector cost calculation. */
6905 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
6908 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
6909 vi++;
6911 while (vi < li_vector_costs.length ()
6912 && li_vector_costs[vi].first == vl);
6913 finish_cost (vect_target_cost_data, scalar_target_cost_data,
6914 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
6915 delete scalar_target_cost_data;
6916 delete vect_target_cost_data;
6918 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
6920 if (dump_enabled_p ())
6922 dump_printf_loc (MSG_NOTE, vect_location,
6923 "Cost model analysis for part in loop %d:\n", sl);
6924 dump_printf (MSG_NOTE, " Vector cost: %d\n",
6925 vec_inside_cost + vec_outside_cost);
6926 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
6929 /* Vectorization is profitable if its cost is no more than the cost of the
6930 scalar version. Note that we err on the vector side for equal cost because
6931 the cost estimate is otherwise quite pessimistic (constant uses are
6932 free on the scalar side but cost a load on the vector side for
6933 example). */
6934 if (vec_outside_cost + vec_inside_cost > scalar_cost)
6936 profitable = false;
6937 break;
6940 if (profitable && vi < li_vector_costs.length ())
6942 if (dump_enabled_p ())
6943 dump_printf_loc (MSG_NOTE, vect_location,
6944 "Excess vector cost for part in loop %d:\n",
6945 li_vector_costs[vi].first);
6946 profitable = false;
6949 /* Unset visited flag. This is delayed when the subgraph is profitable
6950 and we process the loop for remaining unvectorized if-converted code. */
6951 if (!orig_loop || !profitable)
6952 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
6953 gimple_set_visited (cost->stmt_info->stmt, false);
6955 scalar_costs.release ();
6956 vector_costs.release ();
6958 return profitable;
6961 /* qsort comparator for lane defs. */
6963 static int
6964 vld_cmp (const void *a_, const void *b_)
6966 auto *a = (const std::pair<unsigned, tree> *)a_;
6967 auto *b = (const std::pair<unsigned, tree> *)b_;
6968 return a->first - b->first;
6971 /* Return true if USE_STMT is a vector lane insert into VEC and set
6972 *THIS_LANE to the lane number that is set. */
6974 static bool
6975 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
6977 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
6978 if (!use_ass
6979 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
6980 || (vec
6981 ? gimple_assign_rhs1 (use_ass) != vec
6982 : ((vec = gimple_assign_rhs1 (use_ass)), false))
6983 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
6984 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
6985 || !constant_multiple_p
6986 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
6987 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
6988 this_lane))
6989 return false;
6990 return true;
6993 /* Find any vectorizable constructors and add them to the grouped_store
6994 array. */
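/* A vectorizable constructor is a statement like
     v_5 = {_1, _2, _3, _4};
   where the vector type has as many lanes as the constructor has elements
   and each element is an SSA name defined inside the region.  Chains of
   BIT_INSERT_EXPRs filling all lanes of a vector and associatable scalar
   reduction chains are detected here as well and recorded as SLP roots.  */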
6996 static void
6997 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
6999 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7000 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7001 !gsi_end_p (gsi); gsi_next (&gsi))
7003 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7004 if (!assign)
7005 continue;
7007 tree rhs = gimple_assign_rhs1 (assign);
7008 enum tree_code code = gimple_assign_rhs_code (assign);
7009 use_operand_p use_p;
7010 gimple *use_stmt;
7011 if (code == CONSTRUCTOR)
7013 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7014 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7015 CONSTRUCTOR_NELTS (rhs))
7016 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7017 || uniform_vector_p (rhs))
7018 continue;
7020 unsigned j;
7021 tree val;
7022 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7023 if (TREE_CODE (val) != SSA_NAME
7024 || !bb_vinfo->lookup_def (val))
7025 break;
7026 if (j != CONSTRUCTOR_NELTS (rhs))
7027 continue;
7029 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
7030 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
7032 else if (code == BIT_INSERT_EXPR
7033 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7034 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7035 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7036 && integer_zerop (gimple_assign_rhs3 (assign))
7037 && useless_type_conversion_p
7038 (TREE_TYPE (TREE_TYPE (rhs)),
7039 TREE_TYPE (gimple_assign_rhs2 (assign)))
7040 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7042 	    /* We start matching at an insert into lane zero, but since the
7043 	       inserts need not be ordered we have to search both
7044 	       the def and the use chains.  */
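	    /* The chain being matched looks roughly like
		 v_1 = BIT_INSERT_EXPR <v_0, a_0, 0>;
		 v_2 = BIT_INSERT_EXPR <v_1, a_1, 32>;
		 ...
	       with one insert per lane (here for 32-bit elements), though
	       the lanes may be written in any order.  */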
7045 tree vectype = TREE_TYPE (rhs);
7046 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7047 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7048 auto_sbitmap lanes (nlanes);
7049 bitmap_clear (lanes);
7050 bitmap_set_bit (lanes, 0);
7051 tree def = gimple_assign_lhs (assign);
7052 lane_defs.quick_push
7053 (std::make_pair (0, gimple_assign_rhs2 (assign)));
7054 unsigned lanes_found = 1;
7055 /* Start with the use chains, the last stmt will be the root. */
7056 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7057 vec<stmt_vec_info> roots = vNULL;
7058 roots.safe_push (last);
7061 use_operand_p use_p;
7062 gimple *use_stmt;
7063 if (!single_imm_use (def, &use_p, &use_stmt))
7064 break;
7065 unsigned this_lane;
7066 if (!bb_vinfo->lookup_stmt (use_stmt)
7067 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7068 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7069 break;
7070 if (bitmap_bit_p (lanes, this_lane))
7071 break;
7072 lanes_found++;
7073 bitmap_set_bit (lanes, this_lane);
7074 gassign *use_ass = as_a <gassign *> (use_stmt);
7075 lane_defs.quick_push (std::make_pair
7076 (this_lane, gimple_assign_rhs2 (use_ass)));
7077 last = bb_vinfo->lookup_stmt (use_ass);
7078 roots.safe_push (last);
7079 def = gimple_assign_lhs (use_ass);
7081 while (lanes_found < nlanes);
7082 if (roots.length () > 1)
7083 std::swap(roots[0], roots[roots.length () - 1]);
7084 if (lanes_found < nlanes)
7086 /* Now search the def chain. */
7087 def = gimple_assign_rhs1 (assign);
7090 if (TREE_CODE (def) != SSA_NAME
7091 || !has_single_use (def))
7092 break;
7093 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7094 unsigned this_lane;
7095 if (!bb_vinfo->lookup_stmt (def_stmt)
7096 || !vect_slp_is_lane_insert (def_stmt,
7097 NULL_TREE, &this_lane)
7098 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7099 break;
7100 if (bitmap_bit_p (lanes, this_lane))
7101 break;
7102 lanes_found++;
7103 bitmap_set_bit (lanes, this_lane);
7104 lane_defs.quick_push (std::make_pair
7105 (this_lane,
7106 gimple_assign_rhs2 (def_stmt)));
7107 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7108 def = gimple_assign_rhs1 (def_stmt);
7110 while (lanes_found < nlanes);
7112 if (lanes_found == nlanes)
7114 	      /* Sort lane_defs by lane index and register the root.  */
7115 lane_defs.qsort (vld_cmp);
7116 vec<stmt_vec_info> stmts;
7117 stmts.create (nlanes);
7118 for (unsigned i = 0; i < nlanes; ++i)
7119 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7120 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7121 stmts, roots));
7123 else
7124 roots.release ();
7126 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7127 && (associative_tree_code (code) || code == MINUS_EXPR)
7128 /* ??? The flag_associative_math and TYPE_OVERFLOW_WRAPS
7129 checks pessimize a two-element reduction. PR54400.
7130 ??? In-order reduction could be handled if we only
7131 traverse one operand chain in vect_slp_linearize_chain. */
7132 && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
7133 || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
7134 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
7135 /* Ops with constants at the tail can be stripped here. */
7136 && TREE_CODE (rhs) == SSA_NAME
7137 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7138 /* Should be the chain end. */
7139 && (!single_imm_use (gimple_assign_lhs (assign),
7140 &use_p, &use_stmt)
7141 || !is_gimple_assign (use_stmt)
7142 || (gimple_assign_rhs_code (use_stmt) != code
7143 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7144 || (gimple_assign_rhs_code (use_stmt)
7145 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7147 /* We start the match at the end of a possible association
7148 chain. */
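	  /* E.g. for
	       x_1 = a_1 + b_1;
	       x_2 = x_1 + c_1;
	       x_3 = x_2 + d_1;
	     the match starts at the chain end x_3 and the linearized chain
	     consists of the leaf operands a_1, b_1, c_1 and d_1, whose
	     definitions become the scalar lanes of a BB reduction.  */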
7149 auto_vec<chain_op_t> chain;
7150 auto_vec<std::pair<tree_code, gimple *> > worklist;
7151 auto_vec<gimple *> chain_stmts;
7152 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7153 if (code == MINUS_EXPR)
7154 code = PLUS_EXPR;
7155 internal_fn reduc_fn;
7156 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7157 || reduc_fn == IFN_LAST)
7158 continue;
7159 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7160 /* ??? */
7161 code_stmt, alt_code_stmt, &chain_stmts);
7162 if (chain.length () > 1)
7164 /* Sort the chain according to def_type and operation. */
7165 chain.sort (dt_sort_cmp, bb_vinfo);
7166 /* ??? Now we'd want to strip externals and constants
7167 but record those to be handled in the epilogue. */
7168 /* ??? For now do not allow mixing ops or externs/constants. */
7169 bool invalid = false;
7170 for (unsigned i = 0; i < chain.length (); ++i)
7171 if (chain[i].dt != vect_internal_def
7172 || chain[i].code != code)
7173 invalid = true;
7174 if (!invalid)
7176 vec<stmt_vec_info> stmts;
7177 stmts.create (chain.length ());
7178 for (unsigned i = 0; i < chain.length (); ++i)
7179 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
7180 vec<stmt_vec_info> roots;
7181 roots.create (chain_stmts.length ());
7182 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7183 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7184 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7185 stmts, roots));
7192 /* Walk the grouped store chains and replace entries with their
7193 pattern variant if any. */
7195 static void
7196 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7198 stmt_vec_info first_element;
7199 unsigned i;
7201 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7203 /* We also have CTORs in this array. */
7204 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7205 continue;
7206 if (STMT_VINFO_IN_PATTERN_P (first_element))
7208 stmt_vec_info orig = first_element;
7209 first_element = STMT_VINFO_RELATED_STMT (first_element);
7210 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7211 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7212 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7213 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7214 vinfo->grouped_stores[i] = first_element;
7216 stmt_vec_info prev = first_element;
7217 while (DR_GROUP_NEXT_ELEMENT (prev))
7219 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7220 if (STMT_VINFO_IN_PATTERN_P (elt))
7222 stmt_vec_info orig = elt;
7223 elt = STMT_VINFO_RELATED_STMT (elt);
7224 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7225 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7226 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7228 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7229 prev = elt;
7234 /* Check if the region described by BB_VINFO can be vectorized, returning
7235 true if so. When returning false, set FATAL to true if the same failure
7236 would prevent vectorization at other vector sizes, false if it is still
7237 worth trying other sizes. N_STMTS is the number of statements in the
7238 region. */
7240 static bool
7241 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7242 vec<int> *dataref_groups)
7244 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7246 slp_instance instance;
7247 int i;
7248 poly_uint64 min_vf = 2;
7250 /* The first group of checks is independent of the vector size. */
7251 fatal = true;
7253 /* Analyze the data references. */
7255 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7257 if (dump_enabled_p ())
7258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7259 "not vectorized: unhandled data-ref in basic "
7260 "block.\n");
7261 return false;
7264 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7266 if (dump_enabled_p ())
7267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7268 "not vectorized: unhandled data access in "
7269 "basic block.\n");
7270 return false;
7273 vect_slp_check_for_constructors (bb_vinfo);
7275 /* If there are no grouped stores and no constructors in the region
7276 there is no need to continue with pattern recog as vect_analyze_slp
7277 will fail anyway. */
7278 if (bb_vinfo->grouped_stores.is_empty ()
7279 && bb_vinfo->roots.is_empty ())
7281 if (dump_enabled_p ())
7282 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7283 "not vectorized: no grouped stores in "
7284 "basic block.\n");
7285 return false;
7288   /* The rest of the analysis below depends on the vector size in some way.  */
7289 fatal = false;
7291 vect_pattern_recog (bb_vinfo);
7293 /* Update store groups from pattern processing. */
7294 vect_fixup_store_groups_with_patterns (bb_vinfo);
7296 /* Check the SLP opportunities in the basic block, analyze and build SLP
7297 trees. */
7298 if (!vect_analyze_slp (bb_vinfo, n_stmts))
7300 if (dump_enabled_p ())
7302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7303 "Failed to SLP the basic block.\n");
7304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7305 "not vectorized: failed to find SLP opportunities "
7306 "in basic block.\n");
7308 return false;
7311 /* Optimize permutations. */
7312 vect_optimize_slp (bb_vinfo);
7314 /* Gather the loads reachable from the SLP graph entries. */
7315 vect_gather_slp_loads (bb_vinfo);
7317 vect_record_base_alignments (bb_vinfo);
7319 /* Analyze and verify the alignment of data references and the
7320 dependence in the SLP instances. */
7321 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7323 vect_location = instance->location ();
7324 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7325 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7327 slp_tree node = SLP_INSTANCE_TREE (instance);
7328 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7329 if (dump_enabled_p ())
7330 dump_printf_loc (MSG_NOTE, vect_location,
7331 "removing SLP instance operations starting from: %G",
7332 stmt_info->stmt);
7333 vect_free_slp_instance (instance);
7334 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7335 continue;
7338 /* Mark all the statements that we want to vectorize as pure SLP and
7339 relevant. */
7340 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7341 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7342 unsigned j;
7343 stmt_vec_info root;
7344 /* Likewise consider instance root stmts as vectorized. */
7345 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7346 STMT_SLP_TYPE (root) = pure_slp;
7348 i++;
7350 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7351 return false;
7353 if (!vect_slp_analyze_operations (bb_vinfo))
7355 if (dump_enabled_p ())
7356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7357 "not vectorized: bad operation in basic block.\n");
7358 return false;
7361 vect_bb_partition_graph (bb_vinfo);
7363 return true;
7366 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
7367 basic blocks in BBS, returning true on success.
7368 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
7370 static bool
7371 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7372 vec<int> *dataref_groups, unsigned int n_stmts,
7373 loop_p orig_loop)
7375 bb_vec_info bb_vinfo;
7376 auto_vector_modes vector_modes;
7378 /* Autodetect first vector size we try. */
7379 machine_mode next_vector_mode = VOIDmode;
7380 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7381 unsigned int mode_i = 0;
7383 vec_info_shared shared;
7385 machine_mode autodetected_vector_mode = VOIDmode;
7386 while (1)
7388 bool vectorized = false;
7389 bool fatal = false;
7390 bb_vinfo = new _bb_vec_info (bbs, &shared);
7392 bool first_time_p = shared.datarefs.is_empty ();
7393 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7394 if (first_time_p)
7395 bb_vinfo->shared->save_datarefs ();
7396 else
7397 bb_vinfo->shared->check_datarefs ();
7398 bb_vinfo->vector_mode = next_vector_mode;
7400 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7402 if (dump_enabled_p ())
7404 dump_printf_loc (MSG_NOTE, vect_location,
7405 "***** Analysis succeeded with vector mode"
7406 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7407 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7410 bb_vinfo->shared->check_datarefs ();
7412 auto_vec<slp_instance> profitable_subgraphs;
7413 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7415 if (instance->subgraph_entries.is_empty ())
7416 continue;
7418 vect_location = instance->location ();
7419 if (!unlimited_cost_model (NULL)
7420 && !vect_bb_vectorization_profitable_p
7421 (bb_vinfo, instance->subgraph_entries, orig_loop))
7423 if (dump_enabled_p ())
7424 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7425 "not vectorized: vectorization is not "
7426 "profitable.\n");
7427 continue;
7430 if (!dbg_cnt (vect_slp))
7431 continue;
7433 profitable_subgraphs.safe_push (instance);
7436 /* When we're vectorizing an if-converted loop body make sure
7437 we vectorized all if-converted code. */
7438 if (!profitable_subgraphs.is_empty ()
7439 && orig_loop)
7441 gcc_assert (bb_vinfo->bbs.length () == 1);
7442 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7443 !gsi_end_p (gsi); gsi_next (&gsi))
7445 /* The costing above left us with DCEable vectorized scalar
7446 stmts having the visited flag set on profitable
7447 subgraphs. Do the delayed clearing of the flag here. */
7448 if (gimple_visited_p (gsi_stmt (gsi)))
7450 gimple_set_visited (gsi_stmt (gsi), false);
7451 continue;
7453 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7454 continue;
7456 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7457 if (gimple_assign_rhs_code (ass) == COND_EXPR)
7459 if (!profitable_subgraphs.is_empty ()
7460 && dump_enabled_p ())
7461 dump_printf_loc (MSG_NOTE, vect_location,
7462 "not profitable because of "
7463 "unprofitable if-converted scalar "
7464 "code\n");
7465 profitable_subgraphs.truncate (0);
7470 /* Finally schedule the profitable subgraphs. */
7471 for (slp_instance instance : profitable_subgraphs)
7473 if (!vectorized && dump_enabled_p ())
7474 dump_printf_loc (MSG_NOTE, vect_location,
7475 "Basic block will be vectorized "
7476 "using SLP\n");
7477 vectorized = true;
7479 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7481 unsigned HOST_WIDE_INT bytes;
7482 if (dump_enabled_p ())
7484 if (GET_MODE_SIZE
7485 (bb_vinfo->vector_mode).is_constant (&bytes))
7486 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
7487 "basic block part vectorized using %wu "
7488 "byte vectors\n", bytes);
7489 else
7490 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
7491 "basic block part vectorized using "
7492 "variable length vectors\n");
7496 else
7498 if (dump_enabled_p ())
7499 dump_printf_loc (MSG_NOTE, vect_location,
7500 "***** Analysis failed with vector mode %s\n",
7501 GET_MODE_NAME (bb_vinfo->vector_mode));
7504 if (mode_i == 0)
7505 autodetected_vector_mode = bb_vinfo->vector_mode;
7507 if (!fatal)
7508 while (mode_i < vector_modes.length ()
7509 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7511 if (dump_enabled_p ())
7512 dump_printf_loc (MSG_NOTE, vect_location,
7513 "***** The result for vector mode %s would"
7514 " be the same\n",
7515 GET_MODE_NAME (vector_modes[mode_i]));
7516 mode_i += 1;
7519 delete bb_vinfo;
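      /* Skip following modes that describe the same vectors as the mode we
	 autodetected: when each mode is the related vector mode of the
	 other's element mode, re-analyzing could only repeat the result.  */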
7521 if (mode_i < vector_modes.length ()
7522 && VECTOR_MODE_P (autodetected_vector_mode)
7523 && (related_vector_mode (vector_modes[mode_i],
7524 GET_MODE_INNER (autodetected_vector_mode))
7525 == autodetected_vector_mode)
7526 && (related_vector_mode (autodetected_vector_mode,
7527 GET_MODE_INNER (vector_modes[mode_i]))
7528 == vector_modes[mode_i]))
7530 if (dump_enabled_p ())
7531 dump_printf_loc (MSG_NOTE, vect_location,
7532 "***** Skipping vector mode %s, which would"
7533 " repeat the analysis for %s\n",
7534 GET_MODE_NAME (vector_modes[mode_i]),
7535 GET_MODE_NAME (autodetected_vector_mode));
7536 mode_i += 1;
7539 if (vectorized
7540 || mode_i == vector_modes.length ()
7541 || autodetected_vector_mode == VOIDmode
7542 /* If vect_slp_analyze_bb_1 signaled that analysis for all
7543 vector sizes will fail do not bother iterating. */
7544 || fatal)
7545 return vectorized;
7547 /* Try the next biggest vector size. */
7548 next_vector_mode = vector_modes[mode_i++];
7549 if (dump_enabled_p ())
7550 dump_printf_loc (MSG_NOTE, vect_location,
7551 "***** Re-trying analysis with vector mode %s\n",
7552 GET_MODE_NAME (next_vector_mode));
7557 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
7558 true if anything in the basic-block was vectorized. */
7560 static bool
7561 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7563 vec<data_reference_p> datarefs = vNULL;
7564 auto_vec<int> dataref_groups;
7565 int insns = 0;
7566 int current_group = 0;
7568 for (unsigned i = 0; i < bbs.length (); i++)
7570 basic_block bb = bbs[i];
7571 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
7572 gsi_next (&gsi))
7574 gimple *stmt = gsi_stmt (gsi);
7575 if (is_gimple_debug (stmt))
7576 continue;
7578 insns++;
7580 if (gimple_location (stmt) != UNKNOWN_LOCATION)
7581 vect_location = stmt;
7583 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
7584 &dataref_groups, current_group))
7585 ++current_group;
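	  /* If the data references of STMT could not be analyzed and recorded,
	     start a new group so that accesses are never grouped across it.  */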
7587 /* New BBs always start a new DR group. */
7588 ++current_group;
7591 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
7594 /* Special entry for the BB vectorizer. Analyze and transform a single
7595 if-converted BB with ORIG_LOOPs body being the not if-converted
7596 representation. Returns true if anything in the basic-block was
7597 vectorized. */
7599 bool
7600 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
7602 auto_vec<basic_block> bbs;
7603 bbs.safe_push (bb);
7604 return vect_slp_bbs (bbs, orig_loop);
7607 /* Main entry for the BB vectorizer.  Analyze and transform all basic blocks
7608    in FUN, returning true if anything was vectorized.  */
7610 bool
7611 vect_slp_function (function *fun)
7613 bool r = false;
7614 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
7615 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
7617 /* For the moment split the function into pieces to avoid making
7618 the iteration on the vector mode moot. Split at points we know
7619 to not handle well, which are CFG merges (SLP discovery doesn't
7620 handle non-loop-header PHIs) and loop exits. Since pattern
7621 recog requires reverse iteration to visit uses before defs
7622 simply chop RPO into pieces. */
7623 auto_vec<basic_block> bbs;
7624 for (unsigned i = 0; i < n; i++)
7626 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
7627 bool split = false;
7629 /* Split when a BB is not dominated by the first block. */
7630 if (!bbs.is_empty ()
7631 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
7633 if (dump_enabled_p ())
7634 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7635 "splitting region at dominance boundary bb%d\n",
7636 bb->index);
7637 split = true;
7639 /* Split when the loop determined by the first block
7640 is exited. This is because we eventually insert
7641 invariants at region begin. */
7642 else if (!bbs.is_empty ()
7643 && bbs[0]->loop_father != bb->loop_father
7644 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
7646 if (dump_enabled_p ())
7647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7648 "splitting region at loop %d exit at bb%d\n",
7649 bbs[0]->loop_father->num, bb->index);
7650 split = true;
7653 if (split && !bbs.is_empty ())
7655 r |= vect_slp_bbs (bbs, NULL);
7656 bbs.truncate (0);
7657 bbs.quick_push (bb);
7659 else
7660 bbs.safe_push (bb);
7662 	  /* When the stmt ending this block defines a value, inserting a
7663 	     vector containing its definition after the stmt would require
7664 	     inserting on edges.  Avoid this for now.  */
7665 if (gimple *last = last_stmt (bb))
7666 if (gimple_get_lhs (last)
7667 && is_ctrl_altering_stmt (last))
7669 if (dump_enabled_p ())
7670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7671 "splitting region at control altering "
7672 "definition %G", last);
7673 r |= vect_slp_bbs (bbs, NULL);
7674 bbs.truncate (0);
7678 if (!bbs.is_empty ())
7679 r |= vect_slp_bbs (bbs, NULL);
7681 free (rpo);
7683 return r;
7686 /* Build a variable-length vector in which the elements in ELTS are repeated
7687    to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
7688 RESULTS and add any new instructions to SEQ.
7690 The approach we use is:
7692 (1) Find a vector mode VM with integer elements of mode IM.
7694 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7695 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
7696 from small vectors to IM.
7698 (3) Duplicate each ELTS'[I] into a vector of mode VM.
7700 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
7701 correct byte contents.
7703 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
7705 We try to find the largest IM for which this sequence works, in order
7706 to cut down on the number of interleaves. */
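/* As a minimal illustration, assume NELTS == 2 and that the chosen IM is
   exactly twice the element size (the NVECTORS == 1 case): step (2)
   view-converts the two elements { A, B } into a single IM value AB,
   step (3) splats it to { AB, AB, ... }, no interleaving is required and
   step (5) view-converts the result back, giving { A, B, A, B, ... }.  */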
7708 void
7709 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
7710 const vec<tree> &elts, unsigned int nresults,
7711 vec<tree> &results)
7713 unsigned int nelts = elts.length ();
7714 tree element_type = TREE_TYPE (vector_type);
7716 /* (1) Find a vector mode VM with integer elements of mode IM. */
7717 unsigned int nvectors = 1;
7718 tree new_vector_type;
7719 tree permutes[2];
7720 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
7721 &nvectors, &new_vector_type,
7722 permutes))
7723 gcc_unreachable ();
7725 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
7726 unsigned int partial_nelts = nelts / nvectors;
7727 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
7729 tree_vector_builder partial_elts;
7730 auto_vec<tree, 32> pieces (nvectors * 2);
7731 pieces.quick_grow_cleared (nvectors * 2);
7732 for (unsigned int i = 0; i < nvectors; ++i)
7734 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7735 ELTS' has mode IM. */
7736 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
7737 for (unsigned int j = 0; j < partial_nelts; ++j)
7738 partial_elts.quick_push (elts[i * partial_nelts + j]);
7739 tree t = gimple_build_vector (seq, &partial_elts);
7740 t = gimple_build (seq, VIEW_CONVERT_EXPR,
7741 TREE_TYPE (new_vector_type), t);
7743 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
7744 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
7747 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
7748 correct byte contents.
7750 Conceptually, we need to repeat the following operation log2(nvectors)
7751 times, where hi_start = nvectors / 2:
7753 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
7754 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
7756 However, if each input repeats every N elements and the VF is
7757 a multiple of N * 2, the HI result is the same as the LO result.
7758 This will be true for the first N1 iterations of the outer loop,
7759 followed by N2 iterations for which both the LO and HI results
7760 are needed. I.e.:
7762 N1 + N2 = log2(nvectors)
7764 Each "N1 iteration" doubles the number of redundant vectors and the
7765 effect of the process as a whole is to have a sequence of nvectors/2**N1
7766 vectors that repeats 2**N1 times. Rather than generate these redundant
7767 vectors, we halve the number of vectors for each N1 iteration. */
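  /* For example, with NVECTORS == 2 the single round interleaves pieces[0]
     and pieces[1] using the low-part permute; because both inputs are
     splats, the high-part interleave is only emitted when the vector length
     is not known to be even, as it would otherwise just repeat the low-part
     result.  */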
7768 unsigned int in_start = 0;
7769 unsigned int out_start = nvectors;
7770 unsigned int new_nvectors = nvectors;
7771 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
7773 unsigned int hi_start = new_nvectors / 2;
7774 unsigned int out_i = 0;
7775 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
7777 if ((in_i & 1) != 0
7778 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
7779 2 * in_repeat))
7780 continue;
7782 tree output = make_ssa_name (new_vector_type);
7783 tree input1 = pieces[in_start + (in_i / 2)];
7784 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
7785 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
7786 input1, input2,
7787 permutes[in_i & 1]);
7788 gimple_seq_add_stmt (seq, stmt);
7789 pieces[out_start + out_i] = output;
7790 out_i += 1;
7792 std::swap (in_start, out_start);
7793 new_nvectors = out_i;
7796 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
7797 results.reserve (nresults);
7798 for (unsigned int i = 0; i < nresults; ++i)
7799 if (i < new_nvectors)
7800 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
7801 pieces[in_start + i]));
7802 else
7803 results.quick_push (results[i - new_nvectors]);
7807 /* For constant and loop invariant defs in OP_NODE this function creates
7808 vector defs that will be used in the vectorized stmts and stores them
7809 to SLP_TREE_VEC_DEFS of OP_NODE. */
7811 static void
7812 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
7814 unsigned HOST_WIDE_INT nunits;
7815 tree vec_cst;
7816 unsigned j, number_of_places_left_in_vector;
7817 tree vector_type;
7818 tree vop;
7819 int group_size = op_node->ops.length ();
7820 unsigned int vec_num, i;
7821 unsigned number_of_copies = 1;
7822 bool constant_p;
7823 gimple_seq ctor_seq = NULL;
7824 auto_vec<tree, 16> permute_results;
7826 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
7827 vector_type = SLP_TREE_VECTYPE (op_node);
7829 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
7830 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
7831 auto_vec<tree> voprnds (number_of_vectors);
7833 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
7834 created vectors. It is greater than 1 if unrolling is performed.
7836 For example, we have two scalar operands, s1 and s2 (e.g., group of
7837 strided accesses of size two), while NUNITS is four (i.e., four scalars
7838 of this type can be packed in a vector). The output vector will contain
7839 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
7840 will be 2).
7842 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
7843 containing the operands.
7845 For example, NUNITS is four as before, and the group size is 8
7846 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
7847 {s5, s6, s7, s8}. */
7849 /* When using duplicate_and_interleave, we just need one element for
7850 each scalar statement. */
7851 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
7852 nunits = group_size;
7854 number_of_copies = nunits * number_of_vectors / group_size;
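  /* E.g. for the {s1, s2} group above with NUNITS == 4 and a single vector
     to create this gives NUMBER_OF_COPIES == 4 * 1 / 2 == 2, matching the
     {s1, s2, s1, s2} result.  */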
7856 number_of_places_left_in_vector = nunits;
7857 constant_p = true;
7858 tree_vector_builder elts (vector_type, nunits, 1);
7859 elts.quick_grow (nunits);
7860 stmt_vec_info insert_after = NULL;
7861 for (j = 0; j < number_of_copies; j++)
7863 tree op;
7864 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
7866 /* Create 'vect_ = {op0,op1,...,opn}'. */
7867 number_of_places_left_in_vector--;
7868 tree orig_op = op;
7869 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
7871 if (CONSTANT_CLASS_P (op))
7873 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
7875 /* Can't use VIEW_CONVERT_EXPR for booleans because
7876 of possibly different sizes of scalar value and
7877 vector element. */
7878 if (integer_zerop (op))
7879 op = build_int_cst (TREE_TYPE (vector_type), 0);
7880 else if (integer_onep (op))
7881 op = build_all_ones_cst (TREE_TYPE (vector_type));
7882 else
7883 gcc_unreachable ();
7885 else
7886 op = fold_unary (VIEW_CONVERT_EXPR,
7887 TREE_TYPE (vector_type), op);
7888 gcc_assert (op && CONSTANT_CLASS_P (op));
7890 else
7892 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
7893 gimple *init_stmt;
7894 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
7896 tree true_val
7897 = build_all_ones_cst (TREE_TYPE (vector_type));
7898 tree false_val
7899 = build_zero_cst (TREE_TYPE (vector_type));
7900 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
7901 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
7902 op, true_val,
7903 false_val);
7905 else
7907 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
7908 op);
7909 init_stmt
7910 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
7911 op);
7913 gimple_seq_add_stmt (&ctor_seq, init_stmt);
7914 op = new_temp;
7917 elts[number_of_places_left_in_vector] = op;
7918 if (!CONSTANT_CLASS_P (op))
7919 constant_p = false;
7920 /* For BB vectorization we have to compute an insert location
7921 when a def is inside the analyzed region since we cannot
7922 simply insert at the BB start in this case. */
7923 stmt_vec_info opdef;
7924 if (TREE_CODE (orig_op) == SSA_NAME
7925 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
7926 && is_a <bb_vec_info> (vinfo)
7927 && (opdef = vinfo->lookup_def (orig_op)))
7929 if (!insert_after)
7930 insert_after = opdef;
7931 else
7932 insert_after = get_later_stmt (insert_after, opdef);
7935 if (number_of_places_left_in_vector == 0)
7937 if (constant_p
7938 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
7939 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
7940 vec_cst = gimple_build_vector (&ctor_seq, &elts);
7941 else
7943 if (permute_results.is_empty ())
7944 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
7945 elts, number_of_vectors,
7946 permute_results);
7947 vec_cst = permute_results[number_of_vectors - j - 1];
7949 if (!gimple_seq_empty_p (ctor_seq))
7951 if (insert_after)
7953 gimple_stmt_iterator gsi;
7954 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
7956 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
7957 gsi_insert_seq_before (&gsi, ctor_seq,
7958 GSI_CONTINUE_LINKING);
7960 else if (!stmt_ends_bb_p (insert_after->stmt))
7962 gsi = gsi_for_stmt (insert_after->stmt);
7963 gsi_insert_seq_after (&gsi, ctor_seq,
7964 GSI_CONTINUE_LINKING);
7966 else
7968 		      /* When we want to insert after a def whose
7969 			 defining stmt throws, insert on the
7970 			 fallthru edge instead.  */
7971 edge e = find_fallthru_edge
7972 (gimple_bb (insert_after->stmt)->succs);
7973 basic_block new_bb
7974 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
7975 gcc_assert (!new_bb);
7978 else
7979 vinfo->insert_seq_on_entry (NULL, ctor_seq);
7980 ctor_seq = NULL;
7982 voprnds.quick_push (vec_cst);
7983 insert_after = NULL;
7984 number_of_places_left_in_vector = nunits;
7985 constant_p = true;
7986 elts.new_vector (vector_type, nunits, 1);
7987 elts.quick_grow (nunits);
7992   /* Since the vectors were created in reverse order, push them in reverse
7993      so the defs end up in the original order.  */
7994 vec_num = voprnds.length ();
7995 for (j = vec_num; j != 0; j--)
7997 vop = voprnds[j - 1];
7998 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8001   /* In case VF is greater than the unrolling factor needed for the SLP group
8002      of stmts, the NUMBER_OF_VECTORS to create is greater than the number of
8003      distinct vectors built above, and hence we have to replicate those until
8004      NUMBER_OF_VECTORS defs exist.  */
8005 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8006 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8007 i++)
8008 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8011 /* Get the Ith vectorized definition from SLP_NODE. */
8013 tree
8014 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8016 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
8017 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
8018 else
8019 return SLP_TREE_VEC_DEFS (slp_node)[i];
8022 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8024 void
8025 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8027 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8028 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
8030 unsigned j;
8031 gimple *vec_def_stmt;
8032 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
8033 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
8035 else
8036 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8039 /* Get N vectorized definitions for SLP_NODE. */
8041 void
8042 vect_get_slp_defs (vec_info *,
8043 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8045 if (n == -1U)
8046 n = SLP_TREE_CHILDREN (slp_node).length ();
8048 for (unsigned i = 0; i < n; ++i)
8050 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8051 vec<tree> vec_defs = vNULL;
8052 vect_get_slp_defs (child, &vec_defs);
8053 vec_oprnds->quick_push (vec_defs);
8057 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8058 - PERM gives the permutation that the caller wants to use for NODE,
8059 which might be different from SLP_LOAD_PERMUTATION.
8060 - DUMP_P controls whether the function dumps information. */
8062 static bool
8063 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8064 load_permutation_t &perm,
8065 const vec<tree> &dr_chain,
8066 gimple_stmt_iterator *gsi, poly_uint64 vf,
8067 bool analyze_only, bool dump_p,
8068 unsigned *n_perms, unsigned int *n_loads,
8069 bool dce_chain)
8071 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8072 int vec_index = 0;
8073 tree vectype = SLP_TREE_VECTYPE (node);
8074 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8075 unsigned int mask_element;
8076 machine_mode mode;
8078 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8079 return false;
8081 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8083 mode = TYPE_MODE (vectype);
8084 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8086 /* Initialize the vect stmts of NODE to properly insert the generated
8087 stmts later. */
8088 if (! analyze_only)
8089 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
8090 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
8091 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
8093 /* Generate permutation masks for every NODE. Number of masks for each NODE
8094 is equal to GROUP_SIZE.
8095 E.g., we have a group of three nodes with three loads from the same
8096 location in each node, and the vector size is 4. I.e., we have a
8097 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8098 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8099 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8102 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8103 The last mask is illegal since we assume two operands for the permute
8104 operation, and the mask element values can't be outside that range.
8105 Hence, the last mask must be converted into {2,5,5,5}.
8106 For the first two permutations we need the first and the second input
8107 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8108 we need the second and the third vectors: {b1,c1,a2,b2} and
8109 {c2,a3,b3,c3}. */
8111 int vect_stmts_counter = 0;
8112 unsigned int index = 0;
8113 int first_vec_index = -1;
8114 int second_vec_index = -1;
8115 bool noop_p = true;
8116 *n_perms = 0;
8118 vec_perm_builder mask;
8119 unsigned int nelts_to_build;
8120 unsigned int nvectors_per_build;
8121 unsigned int in_nlanes;
8122 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
8123 && multiple_p (nunits, group_size));
8124 if (repeating_p)
8126 /* A single vector contains a whole number of copies of the node, so:
8127 (a) all permutes can use the same mask; and
8128 (b) the permutes only need a single vector input. */
8129 mask.new_vector (nunits, group_size, 3);
8130 nelts_to_build = mask.encoded_nelts ();
8131 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
8132 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
8134 else
8136 /* We need to construct a separate mask for each vector statement. */
8137 unsigned HOST_WIDE_INT const_nunits, const_vf;
8138 if (!nunits.is_constant (&const_nunits)
8139 || !vf.is_constant (&const_vf))
8140 return false;
8141 mask.new_vector (const_nunits, const_nunits, 1);
8142 nelts_to_build = const_vf * group_size;
8143 nvectors_per_build = 1;
8144 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
8146 auto_sbitmap used_in_lanes (in_nlanes);
8147 bitmap_clear (used_in_lanes);
8148 auto_bitmap used_defs;
8150 unsigned int count = mask.encoded_nelts ();
8151 mask.quick_grow (count);
8152 vec_perm_indices indices;
8154 for (unsigned int j = 0; j < nelts_to_build; j++)
8156 unsigned int iter_num = j / group_size;
8157 unsigned int stmt_num = j % group_size;
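	  /* I is the source lane, counted across the concatenation of the
	     loaded vectors in DR_CHAIN, that supplies element STMT_NUM of
	     output copy ITER_NUM.  */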
8158 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info) + perm[stmt_num]);
8159 bitmap_set_bit (used_in_lanes, i);
8160 if (repeating_p)
8162 first_vec_index = 0;
8163 mask_element = i;
8165 else
8167 /* Enforced before the loop when !repeating_p. */
8168 unsigned int const_nunits = nunits.to_constant ();
8169 vec_index = i / const_nunits;
8170 mask_element = i % const_nunits;
8171 if (vec_index == first_vec_index
8172 || first_vec_index == -1)
8174 first_vec_index = vec_index;
8176 else if (vec_index == second_vec_index
8177 || second_vec_index == -1)
8179 second_vec_index = vec_index;
8180 mask_element += const_nunits;
8182 else
8184 if (dump_p)
8185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8186 "permutation requires at "
8187 "least three vectors %G",
8188 stmt_info->stmt);
8189 gcc_assert (analyze_only);
8190 return false;
8193 gcc_assert (mask_element < 2 * const_nunits);
8196 if (mask_element != index)
8197 noop_p = false;
8198 mask[index++] = mask_element;
8200 if (index == count && !noop_p)
8202 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8203 if (!can_vec_perm_const_p (mode, mode, indices))
8205 if (dump_p)
8207 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8208 vect_location,
8209 "unsupported vect permute { ");
8210 for (i = 0; i < count; ++i)
8212 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8213 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8215 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8217 gcc_assert (analyze_only);
8218 return false;
8221 ++*n_perms;
8224 if (index == count)
8226 if (!analyze_only)
8228 tree mask_vec = NULL_TREE;
8230 if (! noop_p)
8231 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8233 if (second_vec_index == -1)
8234 second_vec_index = first_vec_index;
8236 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8238 /* Generate the permute statement if necessary. */
8239 tree first_vec = dr_chain[first_vec_index + ri];
8240 tree second_vec = dr_chain[second_vec_index + ri];
8241 gimple *perm_stmt;
8242 if (! noop_p)
8244 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
8245 tree perm_dest
8246 = vect_create_destination_var (gimple_assign_lhs (stmt),
8247 vectype);
8248 perm_dest = make_ssa_name (perm_dest);
8249 perm_stmt
8250 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8251 first_vec, second_vec,
8252 mask_vec);
8253 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8254 gsi);
8255 if (dce_chain)
8257 bitmap_set_bit (used_defs, first_vec_index + ri);
8258 bitmap_set_bit (used_defs, second_vec_index + ri);
8261 else
8263 /* If mask was NULL_TREE generate the requested
8264 identity transform. */
8265 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
8266 if (dce_chain)
8267 bitmap_set_bit (used_defs, first_vec_index + ri);
8270 /* Store the vector statement in NODE. */
8271 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
8275 index = 0;
8276 first_vec_index = -1;
8277 second_vec_index = -1;
8278 noop_p = true;
8282 if (n_loads)
8284 if (repeating_p)
8285 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8286 else
8288 /* Enforced above when !repeating_p. */
8289 unsigned int const_nunits = nunits.to_constant ();
8290 *n_loads = 0;
8291 bool load_seen = false;
8292 for (unsigned i = 0; i < in_nlanes; ++i)
8294 if (i % const_nunits == 0)
8296 if (load_seen)
8297 *n_loads += 1;
8298 load_seen = false;
8300 if (bitmap_bit_p (used_in_lanes, i))
8301 load_seen = true;
8303 if (load_seen)
8304 *n_loads += 1;
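  /* If requested, release the vectors in DR_CHAIN whose definitions were
     not used by any generated permute.  */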
8308 if (dce_chain)
8309 for (unsigned i = 0; i < dr_chain.length (); ++i)
8310 if (!bitmap_bit_p (used_defs, i))
8312 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
8313 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8314 gsi_remove (&rgsi, true);
8315 release_defs (stmt);
8318 return true;
8321 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8322 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8323 permute statements for the SLP node NODE. Store the number of vector
8324 permute instructions in *N_PERMS and the number of vector load
8325 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8326 that were not needed. */
8328 bool
8329 vect_transform_slp_perm_load (vec_info *vinfo,
8330 slp_tree node, const vec<tree> &dr_chain,
8331 gimple_stmt_iterator *gsi, poly_uint64 vf,
8332 bool analyze_only, unsigned *n_perms,
8333 unsigned int *n_loads, bool dce_chain)
8335 return vect_transform_slp_perm_load_1 (vinfo, node,
8336 SLP_TREE_LOAD_PERMUTATION (node),
8337 dr_chain, gsi, vf, analyze_only,
8338 dump_enabled_p (), n_perms, n_loads,
8339 dce_chain);
8342 /* Produce the next vector result for SLP permutation NODE by adding a vector
8343 statement at GSI. If MASK_VEC is nonnull, add:
8345 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8347 otherwise add:
8349 <new SSA name> = FIRST_DEF. */
8351 static void
8352 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8353 slp_tree node, tree first_def, tree second_def,
8354 tree mask_vec)
8356 tree vectype = SLP_TREE_VECTYPE (node);
8358 /* ??? We SLP match existing vector element extracts but
8359 allow punning which we need to re-instantiate at uses
8360 but have no good way of explicitly representing. */
8361 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8362 && !types_compatible_p (TREE_TYPE (first_def), vectype))
8364 gassign *conv_stmt
8365 = gimple_build_assign (make_ssa_name (vectype),
8366 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8367 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8368 first_def = gimple_assign_lhs (conv_stmt);
8370 gassign *perm_stmt;
8371 tree perm_dest = make_ssa_name (vectype);
8372 if (mask_vec)
8374 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8375 TYPE_SIZE (vectype))
8376 && !types_compatible_p (TREE_TYPE (second_def), vectype))
8378 gassign *conv_stmt
8379 = gimple_build_assign (make_ssa_name (vectype),
8380 build1 (VIEW_CONVERT_EXPR,
8381 vectype, second_def));
8382 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8383 second_def = gimple_assign_lhs (conv_stmt);
8385 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8386 first_def, second_def,
8387 mask_vec);
8389 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8391 /* For identity permutes we still need to handle the case
8392 of lowpart extracts or concats. */
8393 unsigned HOST_WIDE_INT c;
8394 auto first_def_nunits
8395 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8396 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8398 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8399 TYPE_SIZE (vectype), bitsize_zero_node);
8400 perm_stmt = gimple_build_assign (perm_dest, lowpart);
8402 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8403 first_def_nunits, &c) && c == 2)
8405 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8406 NULL_TREE, second_def);
8407 perm_stmt = gimple_build_assign (perm_dest, ctor);
8409 else
8410 gcc_unreachable ();
8412 else
8414 /* We need a copy here in case the def was external. */
8415 perm_stmt = gimple_build_assign (perm_dest, first_def);
8417 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8418 /* Store the vector statement in NODE. */
8419 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
8422 /* Subroutine of vectorizable_slp_permutation. Check whether the target
8423 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8424 If GSI is nonnull, emit the permutation there.
8426 When GSI is null, the only purpose of NODE is to give properties
8427 of the result, such as the vector type and number of SLP lanes.
8428 The node does not need to be a VEC_PERM_EXPR.
8430 If the target supports the operation, return the number of individual
8431 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8432 dump file if DUMP_P is true. */
8434 static int
8435 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8436 slp_tree node, lane_permutation_t &perm,
8437 vec<slp_tree> &children, bool dump_p)
8439 tree vectype = SLP_TREE_VECTYPE (node);
8441 /* ??? We currently only support all same vector input types
8442 while the SLP IL should really do a concat + select and thus accept
8443 arbitrary mismatches. */
8444 slp_tree child;
8445 unsigned i;
8446 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8447 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8448 tree op_vectype = NULL_TREE;
8449 FOR_EACH_VEC_ELT (children, i, child)
8450 if (SLP_TREE_VECTYPE (child))
8452 op_vectype = SLP_TREE_VECTYPE (child);
8453 break;
8455 if (!op_vectype)
8456 op_vectype = vectype;
8457 FOR_EACH_VEC_ELT (children, i, child)
8459 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8460 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8461 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8462 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8464 if (dump_p)
8465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8466 "Unsupported vector types in lane permutation\n");
8467 return -1;
8469 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8470 repeating_p = false;
8473 gcc_assert (perm.length () == SLP_TREE_LANES (node));
8474 if (dump_p)
8476 dump_printf_loc (MSG_NOTE, vect_location,
8477 "vectorizing permutation");
8478 for (unsigned i = 0; i < perm.length (); ++i)
8479 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8480 if (repeating_p)
8481 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8482 dump_printf (MSG_NOTE, "\n");
8485 /* REPEATING_P is true if every output vector is guaranteed to use the
8486 same permute vector. We can handle that case for both variable-length
8487 and constant-length vectors, but we only handle other cases for
8488 constant-length vectors.
8490 Set:
8492 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8493 mask vector that we want to build.
8495 - NCOPIES to the number of copies of PERM that we need in order
8496 to build the necessary permute mask vectors.
8498 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8499 for each permute mask vector. This is only relevant when GSI is
8500 nonnull. */
8501 uint64_t npatterns;
8502 unsigned nelts_per_pattern;
8503 uint64_t ncopies;
8504 unsigned noutputs_per_mask;
8505 if (repeating_p)
8507 /* We need a single permute mask vector that has the form:
8509 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8511 In other words, the original n-element permute in PERM is
8512 "unrolled" to fill a full vector. The stepped vector encoding
8513 that we use for permutes requires 3n elements. */
8514 npatterns = SLP_TREE_LANES (node);
8515 nelts_per_pattern = ncopies = 3;
8516 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8518 else
8520 /* Calculate every element of every permute mask vector explicitly,
8521 instead of relying on the pattern described above. */
8522 if (!nunits.is_constant (&npatterns))
8523 return -1;
8524 nelts_per_pattern = ncopies = 1;
8525 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
8526 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
8527 return -1;
8528 noutputs_per_mask = 1;
8530 unsigned olanes = ncopies * SLP_TREE_LANES (node);
8531 gcc_assert (repeating_p || multiple_p (olanes, nunits));
8533 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
8534 from the { SLP operand, scalar lane } permutation as recorded in the
8535 SLP node as an intermediate step.  This part should already work
8536 with SLP children with an arbitrary number of lanes.  */
8537 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
8538 auto_vec<unsigned> active_lane;
8539 vperm.create (olanes);
8540 active_lane.safe_grow_cleared (children.length (), true);
8541 for (unsigned i = 0; i < ncopies; ++i)
8543 for (unsigned pi = 0; pi < perm.length (); ++pi)
8545 std::pair<unsigned, unsigned> p = perm[pi];
8546 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
8547 if (repeating_p)
8548 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
8549 else
8551 /* We checked above that the vectors are constant-length. */
8552 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
8553 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
8554 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
8555 vperm.quick_push ({{p.first, vi}, vl});
8558 /* Advance to the next group. */
8559 for (unsigned j = 0; j < children.length (); ++j)
8560 active_lane[j] += SLP_TREE_LANES (children[j]);
8563 if (dump_p)
8565 dump_printf_loc (MSG_NOTE, vect_location,
8566 "vectorizing permutation");
8567 for (unsigned i = 0; i < perm.length (); ++i)
8568 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8569 if (repeating_p)
8570 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8571 dump_printf (MSG_NOTE, "\n");
8572 dump_printf_loc (MSG_NOTE, vect_location, "as");
8573 for (unsigned i = 0; i < vperm.length (); ++i)
8575 if (i != 0
8576 && (repeating_p
8577 ? multiple_p (i, npatterns)
8578 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
8579 dump_printf (MSG_NOTE, ",");
8580 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
8581 vperm[i].first.first, vperm[i].first.second,
8582 vperm[i].second);
8584 dump_printf (MSG_NOTE, "\n");
8587 /* We can only handle two-vector permutes; everything else should
8588 be lowered on the SLP level. The following is closely inspired
8589 by vect_transform_slp_perm_load and is supposed to eventually
8590 replace it.
8591 ??? As intermediate step do code-gen in the SLP tree representation
8592 somehow? */
8593 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
8594 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
8595 unsigned int index = 0;
8596 poly_uint64 mask_element;
8597 vec_perm_builder mask;
8598 mask.new_vector (nunits, npatterns, nelts_per_pattern);
8599 unsigned int count = mask.encoded_nelts ();
8600 mask.quick_grow (count);
8601 vec_perm_indices indices;
8602 unsigned nperms = 0;
8603 for (unsigned i = 0; i < vperm.length (); ++i)
8605 mask_element = vperm[i].second;
8606 if (first_vec.first == -1U
8607 || first_vec == vperm[i].first)
8608 first_vec = vperm[i].first;
8609 else if (second_vec.first == -1U
8610 || second_vec == vperm[i].first)
8612 second_vec = vperm[i].first;
8613 mask_element += nunits;
8615 else
8617 if (dump_p)
8618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8619 "permutation requires at "
8620 "least three vectors\n");
8621 gcc_assert (!gsi);
8622 return -1;
8625 mask[index++] = mask_element;
8627 if (index == count)
8629 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
8630 TYPE_VECTOR_SUBPARTS (op_vectype));
8631 bool identity_p = indices.series_p (0, 1, 0, 1);
8632 machine_mode vmode = TYPE_MODE (vectype);
8633 machine_mode op_vmode = TYPE_MODE (op_vectype);
8634 unsigned HOST_WIDE_INT c;
8635 if ((!identity_p
8636 && !can_vec_perm_const_p (vmode, op_vmode, indices))
8637 || (identity_p
8638 && !known_le (nunits,
8639 TYPE_VECTOR_SUBPARTS (op_vectype))
8640 && (!constant_multiple_p (nunits,
8641 TYPE_VECTOR_SUBPARTS (op_vectype),
8642 &c) || c != 2)))
8644 if (dump_p)
8646 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8647 vect_location,
8648 "unsupported vect permute { ");
8649 for (i = 0; i < count; ++i)
8651 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8652 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8654 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8656 gcc_assert (!gsi);
8657 return -1;
8660 if (!identity_p)
8661 nperms++;
8662 if (gsi)
8664 if (second_vec.first == -1U)
8665 second_vec = first_vec;
8667 slp_tree
8668 first_node = children[first_vec.first],
8669 second_node = children[second_vec.first];
8671 tree mask_vec = NULL_TREE;
8672 if (!identity_p)
8673 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8675 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
8677 tree first_def
8678 = vect_get_slp_vect_def (first_node,
8679 first_vec.second + vi);
8680 tree second_def
8681 = vect_get_slp_vect_def (second_node,
8682 second_vec.second + vi);
8683 vect_add_slp_permutation (vinfo, gsi, node, first_def,
8684 second_def, mask_vec);
8688 index = 0;
8689 first_vec = std::make_pair (-1U, -1U);
8690 second_vec = std::make_pair (-1U, -1U);
8694 return nperms;
8697 /* Vectorize the SLP permutations in NODE as specified
8698 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
8699 child number and lane number.
8700 Interleaving of two two-lane two-child SLP subtrees (not supported):
8701 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
8702 A blend of two four-lane two-child SLP subtrees:
8703 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
8704 Highpart of a four-lane one-child SLP subtree (not supported):
8705 [ { 0, 2 }, { 0, 3 } ]
8706 Currently only a subset of these is supported by the code generation below.  */
8708 static bool
8709 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8710 slp_tree node, stmt_vector_for_cost *cost_vec)
8712 tree vectype = SLP_TREE_VECTYPE (node);
8713 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
8714 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
8715 SLP_TREE_CHILDREN (node),
8716 dump_enabled_p ());
8717 if (nperms < 0)
8718 return false;
8720 if (!gsi)
8721 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
8723 return true;
8726 /* Vectorize SLP NODE. */
8728 static void
8729 vect_schedule_slp_node (vec_info *vinfo,
8730 slp_tree node, slp_instance instance)
8732 gimple_stmt_iterator si;
8733 int i;
8734 slp_tree child;
8736 /* For existing vectors there's nothing to do. */
8737 if (SLP_TREE_VEC_DEFS (node).exists ())
8738 return;
8740 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
8742 /* Vectorize externals and constants. */
8743 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
8744 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
8746 /* ??? vectorizable_shift can end up using a scalar operand which is
8747 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
8748 node in this case. */
8749 if (!SLP_TREE_VECTYPE (node))
8750 return;
8752 vect_create_constant_vectors (vinfo, node);
8753 return;
8756 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
8758 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
8759 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
8761 if (dump_enabled_p ())
8762 dump_printf_loc (MSG_NOTE, vect_location,
8763 "------>vectorizing SLP node starting from: %G",
8764 stmt_info->stmt);
8766 if (STMT_VINFO_DATA_REF (stmt_info)
8767 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8769 /* Vectorized loads go before the first scalar load to make it
8770 ready early; vectorized stores go before the last scalar
8771 stmt, which is where all uses are ready. */
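/* For example (assuming a grouped access visited in order): the vector load
   for a[0], a[1], a[2], a[3] is emitted just before the scalar load of a[0],
   while the corresponding vector store is emitted just before the last of
   the scalar stores.  */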
8772 stmt_vec_info last_stmt_info = NULL;
8773 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
8774 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
8775 else /* DR_IS_WRITE */
8776 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
8777 si = gsi_for_stmt (last_stmt_info->stmt);
8779 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
8780 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
8781 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
8782 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8784 /* For PHI node vectorization we do not use the insertion iterator. */
8785 si = gsi_none ();
8787 else
8789 /* Emit other stmts after the children's vectorized defs, which is
8790 the earliest possible place. */
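/* To find that place, walk all children and track the dominance-wise latest
   def statement among their vectorized (or, for externals, scalar) defs;
   the new stmts are then inserted right after it, modulo the special cases
   handled below.  */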
8791 gimple *last_stmt = NULL;
8792 bool seen_vector_def = false;
8793 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8794 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8796 /* For fold-left reductions we retain the scalar
8797 reduction PHI, but SLP_TREE_NUM_VEC_STMTS is still
8798 set, so the representation isn't perfect. Resort to the
8799 last scalar def here. */
8800 if (SLP_TREE_VEC_STMTS (child).is_empty ())
8802 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
8803 == cycle_phi_info_type);
8804 gphi *phi = as_a <gphi *>
8805 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
8806 if (!last_stmt
8807 || vect_stmt_dominates_stmt_p (last_stmt, phi))
8808 last_stmt = phi;
8810 /* We emit all vectorized stmts at the same place, so the last
8811 entry in SLP_TREE_VEC_STMTS is also the last one emitted.
8812 ??? Unless a load permutation is applied and it chooses
8813 to re-use an earlier generated load. */
8814 unsigned j;
8815 gimple *vstmt;
8816 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
8817 if (!last_stmt
8818 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
8819 last_stmt = vstmt;
8821 else if (!SLP_TREE_VECTYPE (child))
8823 /* Externals without a vector type are used unvectorized, so look at all their scalar defs. */
8824 unsigned j;
8825 tree def;
8826 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
8827 if (TREE_CODE (def) == SSA_NAME
8828 && !SSA_NAME_IS_DEFAULT_DEF (def))
8830 gimple *stmt = SSA_NAME_DEF_STMT (def);
8831 if (!last_stmt
8832 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
8833 last_stmt = stmt;
8836 else
8838 /* For externals we have to look at all defs since their
8839 insertion place is decided per vector. But beware
8840 of pre-existing vectors where we need to make sure
8841 we do not insert before the region boundary. */
8842 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
8843 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
8844 seen_vector_def = true;
8845 else
8847 unsigned j;
8848 tree vdef;
8849 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
8850 if (TREE_CODE (vdef) == SSA_NAME
8851 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
8853 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
8854 if (!last_stmt
8855 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
8856 last_stmt = vstmt;
8860 /* This can happen when all children are pre-existing vectors or
8861 constants. */
8862 if (!last_stmt)
8863 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
8864 if (!last_stmt)
8866 gcc_assert (seen_vector_def);
8867 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
8869 else if (is_ctrl_altering_stmt (last_stmt))
8871 /* We split regions to vectorize at control-altering stmts
8872 with a definition, so this must be an external which
8873 we can insert at the start of the region. */
8874 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
8876 else if (is_a <bb_vec_info> (vinfo)
8877 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
8878 && gimple_could_trap_p (stmt_info->stmt))
8880 /* We've constrained possibly trapping operations to all come
8881 from the same basic-block; even if vectorized defs would allow
8882 earlier scheduling, still force the vectorized stmts into the
8883 original block. This is only necessary for BB vectorization since
8884 for loop vectorization all operations are in a single BB and scalar
8885 stmt based placement doesn't play well with epilogue vectorization. */
8886 gcc_assert (dominated_by_p (CDI_DOMINATORS,
8887 gimple_bb (stmt_info->stmt),
8888 gimple_bb (last_stmt)));
8889 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
8891 else if (is_a <gphi *> (last_stmt))
8892 si = gsi_after_labels (gimple_bb (last_stmt));
8893 else
8895 si = gsi_for_stmt (last_stmt);
8896 gsi_next (&si);
8900 bool done_p = false;
8902 /* Handle purely internal nodes. */
8903 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
8905 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
8906 be shared between different SLP nodes (but usually it's the same
8907 operation, except when the stmt is only there to denote
8908 the actual scalar lane defs ...). So do not call vect_transform_stmt
8909 but open-code it here (partly). */
8910 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
8911 gcc_assert (done);
8912 done_p = true;
8914 if (!done_p)
8915 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
8918 /* Replace scalar calls from SLP node NODE by setting their lhs to zero.
8919 For loop vectorization this is done in vectorizable_call, but for SLP
8920 it needs to be deferred until the end of vect_schedule_slp, because multiple
8921 SLP instances may refer to the same scalar stmt. */
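/* For illustration (hypothetical GIMPLE): a vectorized scalar call such as
     _5 = sqrtf (x_3);
   is replaced by
     _5 = 0.0;
   the dummy assignment is then trivially removable once no scalar uses of
   _5 remain.  */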
8923 static void
8924 vect_remove_slp_scalar_calls (vec_info *vinfo,
8925 slp_tree node, hash_set<slp_tree> &visited)
8927 gimple *new_stmt;
8928 gimple_stmt_iterator gsi;
8929 int i;
8930 slp_tree child;
8931 tree lhs;
8932 stmt_vec_info stmt_info;
8934 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
8935 return;
8937 if (visited.add (node))
8938 return;
8940 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8941 vect_remove_slp_scalar_calls (vinfo, child, visited);
8943 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8945 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
8946 if (!stmt || gimple_bb (stmt) == NULL)
8947 continue;
8948 if (is_pattern_stmt_p (stmt_info)
8949 || !PURE_SLP_STMT (stmt_info))
8950 continue;
8951 lhs = gimple_call_lhs (stmt);
8952 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
8953 gsi = gsi_for_stmt (stmt);
8954 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
8955 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
8959 static void
8960 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
8962 hash_set<slp_tree> visited;
8963 vect_remove_slp_scalar_calls (vinfo, node, visited);
8966 /* Vectorize the instance root. */
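/* For the constructor case, e.g. (illustrative GIMPLE) a root stmt
     x_1 = {a_2, b_3, c_4, d_5};
   whose elements were vectorized into a single vector def vect_v is
   replaced by
     x_1 = vect_v;
   possibly wrapped in a VIEW_CONVERT_EXPR when the vector types differ.  */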
8968 void
8969 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
8971 gassign *rstmt = NULL;
8973 if (instance->kind == slp_inst_kind_ctor)
8975 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
8977 gimple *child_stmt = SLP_TREE_VEC_STMTS (node)[0];
8978 tree vect_lhs = gimple_get_lhs (child_stmt);
8979 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
8980 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
8981 TREE_TYPE (vect_lhs)))
8982 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
8983 vect_lhs);
8984 rstmt = gimple_build_assign (root_lhs, vect_lhs);
8986 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
8988 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8989 gimple *child_stmt;
8990 int j;
8991 vec<constructor_elt, va_gc> *v;
8992 vec_alloc (v, nelts);
8994 /* A CTOR can handle composing a V16HI from VNx8HI elements, so we
8995 do not need to convert the vector elements even if the types
8996 do not match. */
8997 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
8998 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
8999 gimple_get_lhs (child_stmt));
9000 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9001 tree rtype
9002 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9003 tree r_constructor = build_constructor (rtype, v);
9004 rstmt = gimple_build_assign (lhs, r_constructor);
9007 else if (instance->kind == slp_inst_kind_bb_reduc)
9009 /* Largely inspired by reduction chain epilogue handling in
9010 vect_create_epilog_for_reduction. */
9011 vec<tree> vec_defs = vNULL;
9012 vect_get_slp_defs (node, &vec_defs);
9013 enum tree_code reduc_code
9014 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9015 /* ??? We actually have to reflect signs somewhere. */
9016 if (reduc_code == MINUS_EXPR)
9017 reduc_code = PLUS_EXPR;
9018 gimple_seq epilogue = NULL;
9019 /* We may end up with more than one vector result; reduce them
9020 to a single vector. */
9021 tree vec_def = vec_defs[0];
9022 for (unsigned i = 1; i < vec_defs.length (); ++i)
9023 vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
9024 vec_def, vec_defs[i]);
9025 vec_defs.release ();
9026 /* ??? Support schemes other than a direct internal fn. */
9027 internal_fn reduc_fn;
9028 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9029 || reduc_fn == IFN_LAST)
9030 gcc_unreachable ();
9031 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9032 TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
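/* For illustration (hypothetical GIMPLE): a PLUS_EXPR bb reduction over two
   vector defs v0 and v1 builds
     tmp_1 = v0 + v1;
     scalar_2 = .REDUC_PLUS (tmp_1);
   with the sequence inserted before the original root stmt, whose rhs is
   then replaced by the scalar result below.  */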
9034 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9035 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9036 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9037 update_stmt (gsi_stmt (rgsi));
9038 return;
9040 else
9041 gcc_unreachable ();
9043 gcc_assert (rstmt);
9045 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9046 gsi_replace (&rgsi, rstmt, true);
9049 struct slp_scc_info
9051 bool on_stack;
9052 int dfs;
9053 int lowlink;
9056 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
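/* Descriptive note (added): this is essentially Tarjan's SCC algorithm; each
   node gets a DFS index and a lowlink, nodes are kept on an explicit stack,
   and a node whose lowlink equals its DFS index is the root of an SCC which
   is then scheduled as a whole.  */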
9058 static void
9059 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9060 hash_map<slp_tree, slp_scc_info> &scc_info,
9061 int &maxdfs, vec<slp_tree> &stack)
9063 bool existed_p;
9064 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9065 gcc_assert (!existed_p);
9066 info->dfs = maxdfs;
9067 info->lowlink = maxdfs;
9068 maxdfs++;
9070 /* Leaf. */
9071 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9073 info->on_stack = false;
9074 vect_schedule_slp_node (vinfo, node, instance);
9075 return;
9078 info->on_stack = true;
9079 stack.safe_push (node);
9081 unsigned i;
9082 slp_tree child;
9083 /* DFS recurse. */
9084 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9086 if (!child)
9087 continue;
9088 slp_scc_info *child_info = scc_info.get (child);
9089 if (!child_info)
9091 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9092 /* Recursion might have re-allocated the hash map, so re-fetch the entries. */
9093 info = scc_info.get (node);
9094 child_info = scc_info.get (child);
9095 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9097 else if (child_info->on_stack)
9098 info->lowlink = MIN (info->lowlink, child_info->dfs);
9100 if (info->lowlink != info->dfs)
9101 return;
9103 auto_vec<slp_tree, 4> phis_to_fixup;
9105 /* Singleton. */
9106 if (stack.last () == node)
9108 stack.pop ();
9109 info->on_stack = false;
9110 vect_schedule_slp_node (vinfo, node, instance);
9111 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9112 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9113 phis_to_fixup.quick_push (node);
9115 else
9117 /* SCC. */
9118 int last_idx = stack.length () - 1;
9119 while (stack[last_idx] != node)
9120 last_idx--;
9121 /* We can break the cycle at PHIs which have at least one child
9122 already code generated. Then we could re-start the DFS walk until
9123 all nodes in the SCC are covered (we might have new entries
9124 for only back-reachable nodes). But it's simpler to just
9125 iterate and schedule those that are ready. */
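/* For example, in a simple reduction cycle  PHI -> add -> PHI  the PHI is
   ready first (one of its children, the preheader def, is already off the
   stack); scheduling it makes the add ready, and the PHI's backedge
   argument is filled in by the fixup loop further below.  */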
9126 unsigned todo = stack.length () - last_idx;
9129 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9131 slp_tree entry = stack[idx];
9132 if (!entry)
9133 continue;
9134 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9135 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9136 bool ready = !phi;
9137 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9138 if (!child)
9140 gcc_assert (phi);
9141 ready = true;
9142 break;
9144 else if (scc_info.get (child)->on_stack)
9146 if (!phi)
9148 ready = false;
9149 break;
9152 else
9154 if (phi)
9156 ready = true;
9157 break;
9160 if (ready)
9162 vect_schedule_slp_node (vinfo, entry, instance);
9163 scc_info.get (entry)->on_stack = false;
9164 stack[idx] = NULL;
9165 todo--;
9166 if (phi)
9167 phis_to_fixup.safe_push (entry);
9171 while (todo != 0);
9173 /* Pop the SCC. */
9174 stack.truncate (last_idx);
9177 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
9178 slp_tree phi_node;
9179 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9181 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9182 edge_iterator ei;
9183 edge e;
9184 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9186 unsigned dest_idx = e->dest_idx;
9187 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9188 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9189 continue;
9190 /* Simply fill all args. */
9191 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
9192 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
9193 vect_get_slp_vect_def (child, i),
9194 e, gimple_phi_arg_location (phi, dest_idx));
9199 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9201 void
9202 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9204 slp_instance instance;
9205 unsigned int i;
9207 hash_map<slp_tree, slp_scc_info> scc_info;
9208 int maxdfs = 0;
9209 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9211 slp_tree node = SLP_INSTANCE_TREE (instance);
9212 if (dump_enabled_p ())
9214 dump_printf_loc (MSG_NOTE, vect_location,
9215 "Vectorizing SLP tree:\n");
9216 /* ??? Dump all? */
9217 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9218 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9219 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9220 vect_print_slp_graph (MSG_NOTE, vect_location,
9221 SLP_INSTANCE_TREE (instance));
9223 /* Schedule the tree of INSTANCE, scheduling SCCs in a way that
9224 has a PHI be the node breaking the cycle. */
9225 auto_vec<slp_tree> stack;
9226 if (!scc_info.get (node))
9227 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9229 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9230 vectorize_slp_instance_root_stmt (node, instance);
9232 if (dump_enabled_p ())
9233 dump_printf_loc (MSG_NOTE, vect_location,
9234 "vectorizing stmts using SLP.\n");
9237 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9239 slp_tree root = SLP_INSTANCE_TREE (instance);
9240 stmt_vec_info store_info;
9241 unsigned int j;
9243 /* Remove scalar call stmts. Do not do this for basic-block
9244 vectorization as not all uses may be vectorized.
9245 ??? Why should this be necessary? DCE should be able to
9246 remove the stmts itself.
9247 ??? For BB vectorization we can as well remove scalar
9248 stmts starting from the SLP tree root if they have no
9249 uses. */
9250 if (is_a <loop_vec_info> (vinfo))
9251 vect_remove_slp_scalar_calls (vinfo, root);
9253 /* Remove the original scalar stmts of vectorized stores. */
9254 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9256 if (!STMT_VINFO_DATA_REF (store_info)
9257 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9258 break;
9260 store_info = vect_orig_stmt (store_info);
9261 /* Free the attached stmt_vec_info and remove the stmt. */
9262 vinfo->remove_stmt (store_info);
9264 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
9265 so that we do not crash in vect_free_slp_tree later. */
9266 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9267 SLP_TREE_REPRESENTATIVE (root) = NULL;