gcc/tree-vect-slp.cc

   1 /* SLP - Basic Block Vectorization
   2    Copyright (C) 2007-2023 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #define INCLUDE_ALGORITHM
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "backend.h"
  27 #include "target.h"
  28 #include "rtl.h"
  29 #include "tree.h"
  30 #include "gimple.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "insn-config.h"
  35 #include "recog.h"              /* FIXME: for insn_data */
  36 #include "fold-const.h"
  37 #include "stor-layout.h"
  38 #include "gimple-iterator.h"
  39 #include "cfgloop.h"
  40 #include "tree-vectorizer.h"
  41 #include "langhooks.h"
  42 #include "gimple-walk.h"
  43 #include "dbgcnt.h"
  44 #include "tree-vector-builder.h"
  45 #include "vec-perm-indices.h"
  46 #include "gimple-fold.h"
  47 #include "internal-fn.h"
  48 #include "dump-context.h"
  49 #include "cfganal.h"
  50 #include "tree-eh.h"
  51 #include "tree-cfg.h"
  52 #include "alloc-pool.h"
  53 #include "sreal.h"
  54 #include "predict.h"
  55
     /* Forward declarations of static helpers defined later in this file.  */
  56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
  57                                             load_permutation_t &,
  58                                             const vec<tree> &,
  59                                             gimple_stmt_iterator *,
  60                                             poly_uint64, bool, bool,
  61                                             unsigned *,
  62                                             unsigned * = nullptr,
  63                                             bool = false);
  64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
  65                                            slp_tree, lane_permutation_t &,
  66                                            vec<slp_tree> &, bool);
  67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
  68                                           slp_tree, stmt_vector_for_cost *);
  69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
  70
     /* Pool from which _slp_tree::operator new obtains storage and to which
        _slp_tree::operator delete returns it.  Created by vect_slp_init and
        destroyed by vect_slp_fini.  */
  71 static object_allocator<_slp_tree> *slp_tree_pool;
     /* Head of the doubly-linked list (prev_node / next_node) of all SLP nodes
        that currently exist.  The _slp_tree constructor pushes new nodes at
        the head, the destructor unlinks them, and vect_slp_fini deletes
        whatever is still on the list.  */
  72 static slp_tree slp_first_node;
  73
  74 void
  75 vect_slp_init (void)
  76 {
  77   slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
  78 }
  79
  80 void
  81 vect_slp_fini (void)
  82 {
  83   while (slp_first_node)
  84     delete slp_first_node;
  85   delete slp_tree_pool;
  86   slp_tree_pool = NULL;
  87 }
  88
  89 void *
  90 _slp_tree::operator new (size_t n)
  91 {
  92   gcc_assert (n == sizeof (_slp_tree));
  93   return slp_tree_pool->allocate_raw ();
  94 }
  95
  96 void
  97 _slp_tree::operator delete (void *node, size_t n)
  98 {
  99   gcc_assert (n == sizeof (_slp_tree));
 100   slp_tree_pool->remove_raw (node);
 101 }
 102
 103
 104 /* Initialize a SLP node.  */
 105
 106 _slp_tree::_slp_tree ()
 107 {
 108   this->prev_node = NULL;
 109   if (slp_first_node)
 110     slp_first_node->prev_node = this;
 111   this->next_node = slp_first_node;
 112   slp_first_node = this;
 113   SLP_TREE_SCALAR_STMTS (this) = vNULL;
 114   SLP_TREE_SCALAR_OPS (this) = vNULL;
 115   SLP_TREE_VEC_STMTS (this) = vNULL;
 116   SLP_TREE_VEC_DEFS (this) = vNULL;
 117   SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
 118   SLP_TREE_CHILDREN (this) = vNULL;
 119   SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
 120   SLP_TREE_LANE_PERMUTATION (this) = vNULL;
 121   SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
 122   SLP_TREE_CODE (this) = ERROR_MARK;
 123   SLP_TREE_VECTYPE (this) = NULL_TREE;
 124   SLP_TREE_REPRESENTATIVE (this) = NULL;
 125   SLP_TREE_REF_COUNT (this) = 1;
 126   this->failed = NULL;
 127   this->max_nunits = 1;
 128   this->lanes = 0;
 129 }
 130
 131 /* Tear down a SLP node.  */
 132
 133 _slp_tree::~_slp_tree ()
 134 {
 135   if (this->prev_node)
 136     this->prev_node->next_node = this->next_node;
 137   else
 138     slp_first_node = this->next_node;
 139   if (this->next_node)
 140     this->next_node->prev_node = this->prev_node;
 141   SLP_TREE_CHILDREN (this).release ();
 142   SLP_TREE_SCALAR_STMTS (this).release ();
 143   SLP_TREE_SCALAR_OPS (this).release ();
 144   SLP_TREE_VEC_STMTS (this).release ();
 145   SLP_TREE_VEC_DEFS (this).release ();
 146   SLP_TREE_LOAD_PERMUTATION (this).release ();
 147   SLP_TREE_LANE_PERMUTATION (this).release ();
 148   if (this->failed)
 149     free (failed);
 150 }
 151
 152 /* Recursively free the memory allocated for the SLP tree rooted at NODE.  */
 153
 154 void
 155 vect_free_slp_tree (slp_tree node)
 156 {
 157   int i;
 158   slp_tree child;
 159
 160   if (--SLP_TREE_REF_COUNT (node) != 0)
 161     return;
 162
 163   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
 164     if (child)
 165       vect_free_slp_tree (child);
 166
 167   /* If the node defines any SLP only patterns then those patterns are no
 168      longer valid and should be removed.  */
 169   stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
 170   if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
 171     {
 172       stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
 173       STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
 174       STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
 175     }
 176
 177   delete node;
 178 }
 179
 180 /* Return a location suitable for dumpings related to the SLP instance.  */
 181
 182 dump_user_location_t
 183 _slp_instance::location () const
 184 {
 185   if (!root_stmts.is_empty ())
 186     return root_stmts[0]->stmt;
 187   else
 188     return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
 189 }
 190
 191
 192 /* Free the memory allocated for the SLP instance.  */
 193
 194 void
 195 vect_free_slp_instance (slp_instance instance)
 196 {
 197   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
 198   SLP_INSTANCE_LOADS (instance).release ();
 199   SLP_INSTANCE_ROOT_STMTS (instance).release ();
 200   instance->subgraph_entries.release ();
 201   instance->cost_vec.release ();
 202   free (instance);
 203 }
 204
 205
 206 /* Create an SLP node for SCALAR_STMTS.  */
 207
 208 slp_tree
 209 vect_create_new_slp_node (unsigned nops, tree_code code)
 210 {
 211   slp_tree node = new _slp_tree;
 212   SLP_TREE_SCALAR_STMTS (node) = vNULL;
 213   SLP_TREE_CHILDREN (node).create (nops);
 214   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
 215   SLP_TREE_CODE (node) = code;
 216   return node;
 217 }
 218 /* Create an SLP node for SCALAR_STMTS.  */
 219
 220 static slp_tree
 221 vect_create_new_slp_node (slp_tree node,
 222                           vec<stmt_vec_info> scalar_stmts, unsigned nops)
 223 {
 224   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
 225   SLP_TREE_CHILDREN (node).create (nops);
 226   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
 227   SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
 228   SLP_TREE_LANES (node) = scalar_stmts.length ();
 229   return node;
 230 }
 231
 232 /* Create an SLP node for SCALAR_STMTS.  */
 233
 234 static slp_tree
 235 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
 236 {
 237   return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
 238 }
 239
 240 /* Create an SLP node for OPS.  */
 241
 242 static slp_tree
 243 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
 244 {
 245   SLP_TREE_SCALAR_OPS (node) = ops;
 246   SLP_TREE_DEF_TYPE (node) = vect_external_def;
 247   SLP_TREE_LANES (node) = ops.length ();
 248   return node;
 249 }
 250
 251 /* Create an SLP node for OPS.  */
 252
 253 static slp_tree
 254 vect_create_new_slp_node (vec<tree> ops)
 255 {
 256   return vect_create_new_slp_node (new _slp_tree, ops);
 257 }
 258
 259
 260 /* This structure is used in creation of an SLP tree.  Each instance
 261    corresponds to the same operand in a group of scalar stmts in an SLP
 262    node.  Instances are allocated by vect_create_oprnd_info and released
        by vect_free_oprnd_info.  */
 263 typedef struct _slp_oprnd_info
 264 {
 265   /* Def-stmts for the operands.  */
 266   vec<stmt_vec_info> def_stmts;
 267   /* Operands.  */
 268   vec<tree> ops;
 269   /* Type of this operand in the first stmt of the group.
 270      vect_get_and_check_slp_defs requires the operand type of every later
 271      stmt to be types_compatible_p with it.  */
 272   tree first_op_type;
       /* Vector def-type of this operand as recorded from the first stmt of
          the group.  vect_get_and_check_slp_defs may later overwrite it with
          vect_external_def, or with vect_uninitialized_def for a skipped
          operand.  */
 273   enum vect_def_type first_dt;
       /* Set by vect_get_and_check_slp_defs when one of the operand's def
          stmts is a pattern stmt that is not the related stmt of its original
          stmt.  When set, a mismatching operand is not demoted to external
          for BB vectorization.  */
 274   bool any_pattern;
 275 } *slp_oprnd_info;
 276
 277
 278 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
 279    operand.  */
 280 static vec<slp_oprnd_info>
 281 vect_create_oprnd_info (int nops, int group_size)
 282 {
 283   int i;
 284   slp_oprnd_info oprnd_info;
 285   vec<slp_oprnd_info> oprnds_info;
 286
 287   oprnds_info.create (nops);
 288   for (i = 0; i < nops; i++)
 289     {
 290       oprnd_info = XNEW (struct _slp_oprnd_info);
 291       oprnd_info->def_stmts.create (group_size);
 292       oprnd_info->ops.create (group_size);
 293       oprnd_info->first_dt = vect_uninitialized_def;
 294       oprnd_info->first_op_type = NULL_TREE;
 295       oprnd_info->any_pattern = false;
 296       oprnds_info.quick_push (oprnd_info);
 297     }
 298
 299   return oprnds_info;
 300 }
 301
 302
 303 /* Free operands info.  */
 304
 305 static void
 306 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
 307 {
 308   int i;
 309   slp_oprnd_info oprnd_info;
 310
 311   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
 312     {
 313       oprnd_info->def_stmts.release ();
 314       oprnd_info->ops.release ();
 315       XDELETE (oprnd_info);
 316     }
 317
 318   oprnds_info.release ();
 319 }
 320
 321 /* Return the execution frequency of NODE (so that a higher value indicates
 322    a "more important" node when optimizing for speed).  */
 323
 324 static sreal
 325 vect_slp_node_weight (slp_tree node)
 326 {
 327   stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
 328   basic_block bb = gimple_bb (stmt_info->stmt);
 329   return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
 330 }
 331
 332 /* Return true if STMTS contains a pattern statement.  */
 333
 334 static bool
 335 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
 336 {
 337   stmt_vec_info stmt_info;
 338   unsigned int i;
 339   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
 340     if (is_pattern_stmt_p (stmt_info))
 341       return true;
 342   return false;
 343 }
 344
 345 /* Return true when all lanes in the external or constant NODE have
 346    the same value.  */
 347
 348 static bool
 349 vect_slp_tree_uniform_p (slp_tree node)
 350 {
 351   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
 352               || SLP_TREE_DEF_TYPE (node) == vect_external_def);
 353
 354   /* Pre-exsting vectors.  */
 355   if (SLP_TREE_SCALAR_OPS (node).is_empty ())
 356     return false;
 357
 358   unsigned i;
 359   tree op, first = NULL_TREE;
 360   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
 361     if (!first)
 362       first = op;
 363     else if (!operand_equal_p (first, op, 0))
 364       return false;
 365
 366   return true;
 367 }
 368
 369 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
 370    that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
 371    of the chain.  */
 372
 373 int
 374 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
 375                                       stmt_vec_info first_stmt_info)
 376 {
 377   stmt_vec_info next_stmt_info = first_stmt_info;
 378   int result = 0;
 379
 380   if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
 381     return -1;
 382
 383   do
 384     {
 385       if (next_stmt_info == stmt_info)
 386         return result;
 387       next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
 388       if (next_stmt_info)
 389         result += DR_GROUP_GAP (next_stmt_info);
 390     }
 391   while (next_stmt_info);
 392
 393   return -1;
 394 }
 395
 396 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
 397    using the method implemented by duplicate_and_interleave.  Return true
 398    if so, returning the number of intermediate vectors in *NVECTORS_OUT
 399    (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
 400    (if nonnull).  If PERMUTES is nonnull, additionally store the two
        permute masks generated from the selectors validated below in
        PERMUTES[0] and PERMUTES[1].  None of the output arguments is written
        when false is returned.  */
 401
 402 bool
 403 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
 404                                 tree elt_type, unsigned int *nvectors_out,
 405                                 tree *vector_type_out,
 406                                 tree *permutes)
 407 {
 408   tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
 409   if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
 410     return false;
 411
 412   machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
       /* Start by trying to fuse all COUNT elements into one integer.  Every
          iteration that fails halves the fused size and doubles NVECTORS.  */
 413   poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
 414   unsigned int nvectors = 1;
 415   for (;;)
 416     {
 417       scalar_int_mode int_mode;
 418       poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
 419       if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
 420         {
 421           /* Get the natural vector type for this SLP group size.  */
 422           tree int_type = build_nonstandard_integer_type
 423             (GET_MODE_BITSIZE (int_mode), 1);
 424           tree vector_type
 425             = get_vectype_for_scalar_type (vinfo, int_type, count);
               /* Only accept an integer vector type that has a vector mode of
                  the same size as the base vector mode.  */
 426           if (vector_type
 427               && VECTOR_MODE_P (TYPE_MODE (vector_type))
 428               && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
 429                            GET_MODE_SIZE (base_vector_mode)))
 430             {
 431               /* Try fusing consecutive sequences of COUNT / NVECTORS elements
 432                  together into elements of type INT_TYPE and using the result
 433                  to build NVECTORS vectors.  */
 434               poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
 435               vec_perm_builder sel1 (nelts, 2, 3);
 436               vec_perm_builder sel2 (nelts, 2, 3);
 437               poly_int64 half_nelts = exact_div (nelts, 2);
                   /* SEL1 starts { 0, NELTS, 1, NELTS + 1, ... } and SEL2
                      starts { HALF_NELTS, HALF_NELTS + NELTS, ... }; indices
                      >= NELTS presumably select from the second input
                      vector -- TODO confirm against vec_perm_indices.  */
 438               for (unsigned int i = 0; i < 3; ++i)
 439                 {
 440                   sel1.quick_push (i);
 441                   sel1.quick_push (i + nelts);
 442                   sel2.quick_push (half_nelts + i);
 443                   sel2.quick_push (half_nelts + i + nelts);
 444                 }
 445               vec_perm_indices indices1 (sel1, 2, nelts);
 446               vec_perm_indices indices2 (sel2, 2, nelts);
 447               machine_mode vmode = TYPE_MODE (vector_type);
                   /* Both permutes must be supported for the scheme to be
                      usable with this vector type.  */
 448               if (can_vec_perm_const_p (vmode, vmode, indices1)
 449                   && can_vec_perm_const_p (vmode, vmode, indices2))
 450                 {
 451                   if (nvectors_out)
 452                     *nvectors_out = nvectors;
 453                   if (vector_type_out)
 454                     *vector_type_out = vector_type;
 455                   if (permutes)
 456                     {
 457                       permutes[0] = vect_gen_perm_mask_checked (vector_type,
 458                                                                 indices1);
 459                       permutes[1] = vect_gen_perm_mask_checked (vector_type,
 460                                                                 indices2);
 461                     }
 462                   return true;
 463                 }
 464             }
 465         }
           /* This size did not work.  Give up if ELT_BYTES is not a multiple
              of two; otherwise retry with twice as many vectors (multiple_p
              is handed &ELT_BYTES, presumably to receive the halved size).  */
 466       if (!multiple_p (elt_bytes, 2, &elt_bytes))
 467         return false;
 468       nvectors *= 2;
 469     }
 470 }
 471
 472 /* Return true if DTA and DTB match.  */
 473
 474 static bool
 475 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
 476 {
 477   return (dta == dtb
 478           || ((dta == vect_external_def || dta == vect_constant_def)
 479               && (dtb == vect_external_def || dtb == vect_constant_def)));
 480 }
 481
     /* Operand maps returned by vect_get_operand_map.  In each map the first
        entry is the number of child nodes and the remaining entries give, per
        child, the gimple argument index; -1 and -2 denote the first and second
        operand of an embedded comparison.  */

     /* Maps for a COND_EXPR with an embedded comparison, indexed by SWAP.
        Row 1 exchanges the two comparison operands relative to row 0, row 2
        exchanges arguments 1 and 2 instead.  */
 482 static const int cond_expr_maps[3][5] = {
 483   { 4, -1, -2, 1, 2 },
 484   { 4, -2, -1, 1, 2 },
 485   { 4, -1, -2, 2, 1 }
 486 };
     /* One child taken from call argument 1; used for IFN_GATHER_LOAD.  */
 487 static const int arg1_map[] = { 1, 1 };
     /* One child taken from call argument 2; used for IFN_MASK_LOAD.  */
 488 static const int arg2_map[] = { 1, 2 };
     /* Two children taken from call arguments 1 and 4; used for
        IFN_MASK_GATHER_LOAD.  */
 489 static const int arg1_arg4_map[] = { 2, 1, 4 };
     /* Two children with arguments 1 and 0 exchanged; used for a comparison
        assignment when SWAP is nonzero.  */
 490 static const int op1_op0_map[] = { 2, 1, 0 };
 491
 492 /* For most SLP statements, there is a one-to-one mapping between
 493    gimple arguments and child nodes.  If that is not true for STMT,
 494    return an array that contains:
 495
 496    - the number of child nodes, followed by
 497    - for each child node, the index of the argument associated with that node.
 498      The special index -1 is the first operand of an embedded comparison and
 499      the special index -2 is the second operand of an embedded comparison.
 500
 501    SWAP is as for vect_get_and_check_slp_defs.  */
 502
 503 static const int *
 504 vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
 505 {
 506   if (auto assign = dyn_cast<const gassign *> (stmt))
 507     {
 508       if (gimple_assign_rhs_code (assign) == COND_EXPR
 509           && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
 510         return cond_expr_maps[swap];
 511       if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
 512           && swap)
 513         return op1_op0_map;
 514     }
 515   gcc_assert (!swap);
 516   if (auto call = dyn_cast<const gcall *> (stmt))
 517     {
 518       if (gimple_call_internal_p (call))
 519         switch (gimple_call_internal_fn (call))
 520           {
 521           case IFN_MASK_LOAD:
 522             return arg2_map;
 523
 524           case IFN_GATHER_LOAD:
 525             return arg1_map;
 526
 527           case IFN_MASK_GATHER_LOAD:
 528             return arg1_arg4_map;
 529
 530           default:
 531             break;
 532           }
 533     }
 534   return nullptr;
 535 }
 536
 537 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
 538    they are of a valid type and that they match the defs of the first stmt of
 539    the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
 540    by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
 541    indicates swap is required for cond_expr stmts.  Specifically, SWAP
 542    is 1 if STMT is cond and operands of comparison need to be swapped;
 543    SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
 544
 545    If there was a fatal error return -1; if the error could be corrected by
 546    swapping operands of father node of this one, return 1; if everything is
 547    ok return 0.

        SKIP_ARGS[I] being true turns operand I into a placeholder: a NULL
        def-stmt and a NULL_TREE operand are recorded for it and it takes no
        part in the matching against the first stmt.  STMT_NUM == 0 identifies
        the first stmt of the group, whose operands only seed FIRST_DT and
        FIRST_OP_TYPE in OPRNDS_INFO.  */
 548 static int
 549 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
 550                              bool *skip_args,
 551                              vec<stmt_vec_info> stmts, unsigned stmt_num,
 552                              vec<slp_oprnd_info> *oprnds_info)
 553 {
 554   stmt_vec_info stmt_info = stmts[stmt_num];
 555   tree oprnd;
 556   unsigned int i, number_of_oprnds;
 557   enum vect_def_type dt = vect_uninitialized_def;
 558   slp_oprnd_info oprnd_info;
       /* Index of the first of two operands that may be exchanged; -1U when
          no such operand was identified below.  */
 559   unsigned int commutative_op = -1U;
 560   bool first = stmt_num == 0;
 561
       /* Only calls, assignments and PHIs are handled.  */
 562   if (!is_a<gcall *> (stmt_info->stmt)
 563       && !is_a<gassign *> (stmt_info->stmt)
 564       && !is_a<gphi *> (stmt_info->stmt))
 565     return -1;
 566
       /* By default every gimple argument is an operand; an operand map
          overrides both the count and the argument each operand comes from.  */
 567   number_of_oprnds = gimple_num_args (stmt_info->stmt);
 568   const int *map = vect_get_operand_map (stmt_info->stmt, swap);
 569   if (map)
 570     number_of_oprnds = *map++;
 571   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
 572     {
 573       if (gimple_call_internal_p (stmt))
 574         {
 575           internal_fn ifn = gimple_call_internal_fn (stmt);
 576           commutative_op = first_commutative_argument (ifn);
 577         }
 578     }
 579   else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
 580     {
 581       if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
 582         commutative_op = 0;
 583     }
 584
       /* A nonzero SWAP counts as already swapped, which disables the operand
          swapping attempted in the matching loop below.  */
 585   bool swapped = (swap != 0);
 586   bool backedge = false;
 587   enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
       /* First pass: analyze each operand and record it in OPRNDS_INFO.  */
 588   for (i = 0; i < number_of_oprnds; i++)
 589     {
 590       int opno = map ? map[i] : int (i);
           /* Negative map entries select an operand of the comparison embedded
              in argument 0: -1 is its operand 0 and -2 its operand 1.  */
 591       if (opno < 0)
 592         oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
 593       else
 594         {
 595           oprnd = gimple_arg (stmt_info->stmt, opno);
 596           if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
 597             backedge = dominated_by_p (CDI_DOMINATORS,
 598                                        gimple_phi_arg_edge (stmt, opno)->src,
 599                                        gimple_bb (stmt_info->stmt));
 600         }
 601       if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
 602         oprnd = TREE_OPERAND (oprnd, 0);
 603
 604       oprnd_info = (*oprnds_info)[i];
 605
 606       stmt_vec_info def_stmt_info;
 607       if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
 608         {
 609           if (dump_enabled_p ())
 610             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 611                              "Build SLP failed: can't analyze def for %T\n",
 612                              oprnd);
 613
 614           return -1;
 615         }
 616
           /* Skipped operands still get an entry so that the vectors stay
              indexable by STMT_NUM.  */
 617       if (skip_args[i])
 618         {
 619           oprnd_info->def_stmts.quick_push (NULL);
 620           oprnd_info->ops.quick_push (NULL_TREE);
 621           oprnd_info->first_dt = vect_uninitialized_def;
 622           continue;
 623         }
 624
 625       oprnd_info->def_stmts.quick_push (def_stmt_info);
 626       oprnd_info->ops.quick_push (oprnd);
 627
 628       if (def_stmt_info
 629           && is_pattern_stmt_p (def_stmt_info))
 630         {
 631           if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
 632               != def_stmt_info)
 633             oprnd_info->any_pattern = true;
 634           else
 635             /* If we promote this to external use the original stmt def.  */
 636             oprnd_info->ops.last ()
 637               = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
 638         }
 639
 640       /* If there's an extern def on a backedge make sure we can
 641          code-generate at the region start.
 642          ???  This is another case that could be fixed by adjusting
 643          how we split the function but at the moment we'd have conflicting
 644          goals there.  */
 645       if (backedge
 646           && dts[i] == vect_external_def
 647           && is_a <bb_vec_info> (vinfo)
 648           && TREE_CODE (oprnd) == SSA_NAME
 649           && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
 650           && !dominated_by_p (CDI_DOMINATORS,
 651                               as_a <bb_vec_info> (vinfo)->bbs[0],
 652                               gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
 653         {
 654           if (dump_enabled_p ())
 655             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 656                              "Build SLP failed: extern def %T only defined "
 657                              "on backedge\n", oprnd);
 658           return -1;
 659         }
 660
 661       if (first)
 662         {
 663           tree type = TREE_TYPE (oprnd);
 664           dt = dts[i];
               /* With a non-constant vector mode size, constant and external
                  operands must not be boolean and must be buildable by
                  can_duplicate_and_interleave_p.  */
 665           if ((dt == vect_constant_def
 666                || dt == vect_external_def)
 667               && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
 668               && (TREE_CODE (type) == BOOLEAN_TYPE
 669                   || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
 670                                                       type)))
 671             {
 672               if (dump_enabled_p ())
 673                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 674                                  "Build SLP failed: invalid type of def "
 675                                  "for variable-length SLP %T\n", oprnd);
 676               return -1;
 677             }
 678
 679           /* For the swapping logic below force vect_reduction_def
 680              for the reduction op in a SLP reduction group.  */
 681           if (!STMT_VINFO_DATA_REF (stmt_info)
 682               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
 683               && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
 684               && def_stmt_info)
 685             dts[i] = dt = vect_reduction_def;
 686
 687           /* Check the types of the definition.  */
 688           switch (dt)
 689             {
 690             case vect_external_def:
 691             case vect_constant_def:
 692             case vect_internal_def:
 693             case vect_reduction_def:
 694             case vect_induction_def:
 695             case vect_nested_cycle:
 696             case vect_first_order_recurrence:
 697               break;
 698
 699             default:
 700               /* FORNOW: Not supported.  */
 701               if (dump_enabled_p ())
 702                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 703                                  "Build SLP failed: illegal type of def %T\n",
 704                                  oprnd);
 705               return -1;
 706             }
 707
 708           oprnd_info->first_dt = dt;
 709           oprnd_info->first_op_type = type;
 710         }
 711     }
       /* The first stmt only seeds OPRNDS_INFO; there is nothing to match it
          against.  */
 712   if (first)
 713     return 0;
 714
 715   /* Now match the operand definition types to that of the first stmt.  */
       /* I is advanced explicitly: after a successful operand swap the same
          index is examined again.  */
 716   for (i = 0; i < number_of_oprnds;)
 717     {
 718       if (skip_args[i])
 719         {
 720           ++i;
 721           continue;
 722         }
 723
 724       oprnd_info = (*oprnds_info)[i];
 725       dt = dts[i];
 726       stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
 727       oprnd = oprnd_info->ops[stmt_num];
 728       tree type = TREE_TYPE (oprnd);
 729
 730       if (!types_compatible_p (oprnd_info->first_op_type, type))
 731         {
 732           if (dump_enabled_p ())
 733             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 734                              "Build SLP failed: different operand types\n");
 735           return 1;
 736         }
 737
 738       /* Not first stmt of the group, check that the def-stmt/s match
 739          the def-stmt/s of the first stmt.  Allow different definition
 740          types for reduction chains: the first stmt must be a
 741          vect_reduction_def (a phi node), and the rest
 742          end in the reduction chain.  */
 743       if ((!vect_def_types_match (oprnd_info->first_dt, dt)
 744            && !(oprnd_info->first_dt == vect_reduction_def
 745                 && !STMT_VINFO_DATA_REF (stmt_info)
 746                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
 747                 && def_stmt_info
 748                 && !STMT_VINFO_DATA_REF (def_stmt_info)
 749                 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
 750                     == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
 751           || (!STMT_VINFO_DATA_REF (stmt_info)
 752               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
 753               && ((!def_stmt_info
 754                    || STMT_VINFO_DATA_REF (def_stmt_info)
 755                    || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
 756                        != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
 757                   != (oprnd_info->first_dt != vect_reduction_def))))
 758         {
 759           /* Try swapping operands if we got a mismatch.  For BB
 760              vectorization only in case it will clearly improve things.  */
 761           if (i == commutative_op && !swapped
 762               && (!is_a <bb_vec_info> (vinfo)
 763                   || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
 764                                              dts[i+1])
 765                       && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
 766                           || vect_def_types_match
 767                                ((*oprnds_info)[i+1]->first_dt, dts[i])))))
 768             {
 769               if (dump_enabled_p ())
 770                 dump_printf_loc (MSG_NOTE, vect_location,
 771                                  "trying swapped operands\n");
                   /* Exchange the recorded def type, def-stmt and operand of
                      operands I and I + 1 for this stmt, then re-check I.  */
 772               std::swap (dts[i], dts[i+1]);
 773               std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
 774                          (*oprnds_info)[i+1]->def_stmts[stmt_num]);
 775               std::swap ((*oprnds_info)[i]->ops[stmt_num],
 776                          (*oprnds_info)[i+1]->ops[stmt_num]);
 777               swapped = true;
 778               continue;
 779             }
 780
 781           if (is_a <bb_vec_info> (vinfo)
 782               && !oprnd_info->any_pattern)
 783             {
 784               /* Now for commutative ops we should see whether we can
 785                  make the other operand matching.  */
 786               if (dump_enabled_p ())
 787                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 788                                  "treating operand as external\n");
 789               oprnd_info->first_dt = dt = vect_external_def;
 790             }
 791           else
 792             {
 793               if (dump_enabled_p ())
 794                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 795                                  "Build SLP failed: different types\n");
 796               return 1;
 797             }
 798         }
 799
 800       /* Make sure to demote the overall operand to external.  */
 801       if (dt == vect_external_def)
 802         oprnd_info->first_dt = vect_external_def;
 803       /* For a SLP reduction chain we want to duplicate the reduction to
 804          each of the chain members.  That gets us a sane SLP graph (still
 805          the stmts are not 100% correct wrt the initial values).  */
 806       else if ((dt == vect_internal_def
 807                 || dt == vect_reduction_def)
 808                && oprnd_info->first_dt == vect_reduction_def
 809                && !STMT_VINFO_DATA_REF (stmt_info)
 810                && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
 811                && !STMT_VINFO_DATA_REF (def_stmt_info)
 812                && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
 813                    == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
 814         {
 815           oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
 816           oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
 817         }
 818
 819       ++i;
 820     }
 821
 822   /* Swap operands.  */
       /* Only a dump note is emitted here; the OPRNDS_INFO entries were
          already exchanged in the loop above.  SWAPPED is also true whenever
          the caller passed a nonzero SWAP.  */
 823   if (swapped)
 824     {
 825       if (dump_enabled_p ())
 826         dump_printf_loc (MSG_NOTE, vect_location,
 827                          "swapped operands to match def types in %G",
 828                          stmt_info->stmt);
 829     }
 830
 831   return 0;
 832 }
 833
 834 /* Decide whether the call statements CALL1 and CALL2 may share one SLP
 835    group.  Return true when they are similar enough for that.  */
 836
 837 bool
 838 compatible_calls_p (gcall *call1, gcall *call2)
 839 {
 840   const unsigned int nargs = gimple_call_num_args (call1);
 841   if (nargs != gimple_call_num_args (call2)
 842       || gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
 843     return false;
 844
 845   if (!gimple_call_internal_p (call1))
 846     {
 847       /* Ordinary calls: same callee and same function type.  */
 848       if (!operand_equal_p (gimple_call_fn (call1),
 849                             gimple_call_fn (call2), 0)
 850           || gimple_call_fntype (call1) != gimple_call_fntype (call2))
 851         return false;
 852     }
 853   else
 854     {
 855       /* Internal functions: result and argument types must agree.  */
 856       if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
 857                                TREE_TYPE (gimple_call_lhs (call2))))
 858         return false;
 859       for (unsigned int argno = 0; argno < nargs; ++argno)
 860         {
 861           tree type1 = TREE_TYPE (gimple_call_arg (call1, argno));
 862           tree type2 = TREE_TYPE (gimple_call_arg (call2, argno));
 863           if (!types_compatible_p (type1, type2))
 864             return false;
 865         }
 866     }
 867
 868   /* Arguments not listed in the operand map must be equal.  */
 869   const int *map = vect_get_operand_map (call1);
 870   if (!map)
 871     return true;
 872   const unsigned int nkept = *map++;
 873   unsigned int next = 0;
 874   for (unsigned int argno = 0; argno < nargs; ++argno)
 875     if (next < nkept && map[next] == int (argno))
 876       ++next;
 877     else if (!operand_equal_p (gimple_call_arg (call1, argno),
 878                                gimple_call_arg (call2, argno)))
 879       return false;
 880   return true;
 881 }
 882
 883 /* Helper for vect_build_slp_tree.  VECTYPE is the caller's candidate for
 884    the vector type of STMT_INFO with the narrowest element type.  Return
 885    false when VECTYPE is null, or when VINFO is a basic-block vectorization
 886    and GROUP_SIZE is not a multiple of the number of units in VECTYPE.
 887    Otherwise update *MAX_NUNITS for VECTYPE and return true.  GROUP_SIZE
 888    and MAX_NUNITS are as for vect_build_slp_tree.  */
 889
 890 static bool
 891 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
 892                         unsigned int group_size,
 893                         tree vectype, poly_uint64 *max_nunits)
 894 {
 895   /* Fatal mismatch: there is no vector type to record.  */
 896   if (vectype == NULL_TREE)
 897     {
 898       if (dump_enabled_p ())
 899         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 900                          "Build SLP failed: unsupported data-type in %G\n",
 901                          stmt_info->stmt);
 902       return false;
 903     }
 904
 905   /* Fatal mismatch: filling VECTYPE would need unrolling, which is
 906      rejected for basic-block SLP before *MAX_NUNITS is touched.  */
 907   bool bb_slp_p = is_a <bb_vec_info> (vinfo);
 908   if (bb_slp_p && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
 909     {
 910       if (dump_enabled_p ())
 911         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 912                          "Build SLP failed: unrolling required "
 913                          "in basic block SLP\n");
 914       return false;
 915     }
 916
 917   /* With multiple types in play the smallest one has to be detected,
 918      so fold VECTYPE into *MAX_NUNITS.  */
 919   vect_update_max_nunits (max_nunits, vectype);
 920   return true;
 921 }
 922
 923 /* Check whether the scalar stmts STMTS are isomorphic to the first of
 924    them.  Return true if all are; otherwise return false with MATCHES[i]
 925    false for each stmt I that does not match.  MATCHES[0] being false
 926    means the comparison could not be carried out or the stmts will never
 927    be vectorized by SLP.  *NODE_VECTYPE receives the vector type of the
 928    first stmt, *MAX_NUNITS is updated through vect_record_max_nunits and
 929    *TWO_OPERATORS is set when a second operation code, other than a
 930    reference or comparison code, was accepted for the group.
 931    Note COND_EXPR is possibly isomorphic to another one after swapping its
 932    operands.  Set SWAP[i] to 1 if stmt I is isomorphic to the first stmt
 933    after swapping the two operands of the comparison; set SWAP[i] to 2 if
 934    it is isomorphic after inverting the comparison code.  For example
 935    A1 >= B1 ? X1 : Y1 can become B1 <= A1 ? X1 : Y1 or A1 < B1 ? Y1 : X1.  */
 936
 937 static bool
 938 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
 939                        vec<stmt_vec_info> stmts, unsigned int group_size,
 940                        poly_uint64 *max_nunits, bool *matches,
 941                        bool *two_operators, tree *node_vectype)
 942 {
 943   unsigned int i;
 944   stmt_vec_info first_stmt_info = stmts[0];
 945   code_helper first_stmt_code = ERROR_MARK;
 946   code_helper alt_stmt_code = ERROR_MARK;
 947   code_helper rhs_code = ERROR_MARK;
 948   code_helper first_cond_code = ERROR_MARK;
 949   tree lhs;
 950   bool need_same_oprnds = false;
 951   tree vectype = NULL_TREE, first_op1 = NULL_TREE;
 952   stmt_vec_info first_load = NULL, prev_first_load = NULL;
 953   bool first_stmt_load_p = false, load_p = false;
 954   bool first_stmt_phi_p = false, phi_p = false;
 955   bool maybe_soft_fail = false;
 956   tree soft_fail_nunits_vectype = NULL_TREE;
 957   /* NOTE(review): LOAD_P and PHI_P are not reset per stmt; only the assignment path below rewrites LOAD_P.  */
 958   /* Check each stmt of the group against the first one.  */
 959   stmt_vec_info stmt_info;
 960   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
 961     {
 962       gimple *stmt = stmt_info->stmt;
 963       swap[i] = 0;
 964       matches[i] = false;
 965
 966       if (dump_enabled_p ())
 967         dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
 968
 969       /* Fail to vectorize statements marked as unvectorizable, throw
 970          or are volatile.  */
 971       if (!STMT_VINFO_VECTORIZABLE (stmt_info)
 972           || stmt_can_throw_internal (cfun, stmt)
 973           || gimple_has_volatile_ops (stmt))
 974         {
 975           if (dump_enabled_p ())
 976             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 977                              "Build SLP failed: unvectorizable statement %G",
 978                              stmt);
 979           /* ???  For BB vectorization we want to commutate operands in a way
 980              to shuffle all unvectorizable defs into one operand and have
 981              the other still vectorized.  The following doesn't reliably
 982              work for this though but it's the easiest we can do here.  */
 983           if (is_a <bb_vec_info> (vinfo) && i != 0)
 984             continue;
 985           /* Fatal mismatch.  */
 986           matches[0] = false;
 987           return false;
 988         }
 989
 990       lhs = gimple_get_lhs (stmt);
 991       if (lhs == NULL_TREE)
 992         {
 993           if (dump_enabled_p ())
 994             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 995                              "Build SLP failed: not GIMPLE_ASSIGN nor "
 996                              "GIMPLE_CALL %G", stmt);
 997           if (is_a <bb_vec_info> (vinfo) && i != 0)
 998             continue;
 999           /* Fatal mismatch.  */
1000           matches[0] = false;
1001           return false;
1002         }
1003       /* Determine the vector types to use for this stmt.  */
1004       tree nunits_vectype;
1005       if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1006                                            &nunits_vectype, group_size))
1007         {
1008           if (is_a <bb_vec_info> (vinfo) && i != 0)
1009             continue;
1010           /* Fatal mismatch.  */
1011           matches[0] = false;
1012           return false;
1013         }
1014       /* Record nunits required but continue analysis, producing matches[]
1015          as if nunits was not an issue.  This allows splitting of groups
1016          to happen.  */
1017       if (nunits_vectype
1018           && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1019                                       nunits_vectype, max_nunits))
1020         {
1021           gcc_assert (is_a <bb_vec_info> (vinfo));
1022           maybe_soft_fail = true;
1023           soft_fail_nunits_vectype = nunits_vectype;
1024         }
1025
1026       gcc_assert (vectype);
1027       /* Classify the stmt as call, PHI or assignment.  */
1028       gcall *call_stmt = dyn_cast <gcall *> (stmt);
1029       if (call_stmt)
1030         {
1031           combined_fn cfn = gimple_call_combined_fn (call_stmt);
1032           if (cfn != CFN_LAST)
1033             rhs_code = cfn;
1034           else
1035             rhs_code = CALL_EXPR;
1036
1037           if (cfn == CFN_MASK_LOAD
1038               || cfn == CFN_GATHER_LOAD
1039               || cfn == CFN_MASK_GATHER_LOAD)
1040             load_p = true;
1041           else if ((internal_fn_p (cfn)
1042                     && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1043                    || gimple_call_tail_p (call_stmt)
1044                    || gimple_call_noreturn_p (call_stmt)
1045                    || gimple_call_chain (call_stmt))
1046             {
1047               if (dump_enabled_p ())
1048                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1049                                  "Build SLP failed: unsupported call type %G",
1050                                  (gimple *) call_stmt);
1051               if (is_a <bb_vec_info> (vinfo) && i != 0)
1052                 continue;
1053               /* Fatal mismatch.  */
1054               matches[0] = false;
1055               return false;
1056             }
1057         }
1058       else if (gimple_code (stmt) == GIMPLE_PHI)
1059         {
1060           rhs_code = ERROR_MARK;
1061           phi_p = true;
1062         }
1063       else
1064         {
1065           rhs_code = gimple_assign_rhs_code (stmt);
1066           load_p = gimple_vuse (stmt);
1067         }
1068
1069       /* Check the operation.  */
1070       if (i == 0)
1071         {
1072           *node_vectype = vectype;
1073           first_stmt_code = rhs_code;
1074           first_stmt_load_p = load_p;
1075           first_stmt_phi_p = phi_p;
1076
1077           /* Shift arguments should be equal in all the packed stmts for a
1078              vector shift with scalar shift operand.  */
1079           if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1080               || rhs_code == LROTATE_EXPR
1081               || rhs_code == RROTATE_EXPR)
1082             {
1083               /* First see if we have a vector/vector shift.  */
1084               if (!directly_supported_p (rhs_code, vectype, optab_vector))
1085                 {
1086                   /* No vector/vector shift, try for a vector/scalar shift.  */
1087                   if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1088                     {
1089                       if (dump_enabled_p ())
1090                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1091                                          "Build SLP failed: "
1092                                          "op not supported by target.\n");
1093                       if (is_a <bb_vec_info> (vinfo) && i != 0)
1094                         continue;
1095                       /* Fatal mismatch (I is 0 here; the continue above is dead).  */
1096                       matches[0] = false;
1097                       return false;
1098                     }
1099                   need_same_oprnds = true;
1100                   first_op1 = gimple_assign_rhs2 (stmt);
1101                 }
1102             }
1103           else if (rhs_code == WIDEN_LSHIFT_EXPR)
1104             {
1105               need_same_oprnds = true;
1106               first_op1 = gimple_assign_rhs2 (stmt);
1107             }
1108           else if (!load_p
1109                    && rhs_code == BIT_FIELD_REF)
1110             {
1111               tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1112               if (!is_a <bb_vec_info> (vinfo)
1113                   || TREE_CODE (vec) != SSA_NAME
1114                   /* When the element types are not compatible we pun the
1115                      source to the target vectype which requires equal size.  */
1116                   || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1117                        || !types_compatible_p (TREE_TYPE (vectype),
1118                                                TREE_TYPE (TREE_TYPE (vec))))
1119                       && !operand_equal_p (TYPE_SIZE (vectype),
1120                                            TYPE_SIZE (TREE_TYPE (vec)))))
1121                 {
1122                   if (dump_enabled_p ())
1123                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1124                                      "Build SLP failed: "
1125                                      "BIT_FIELD_REF not supported\n");
1126                   /* Fatal mismatch.  */
1127                   matches[0] = false;
1128                   return false;
1129                 }
1130             }
1131           else if (rhs_code == CFN_DIV_POW2)
1132             {
1133               need_same_oprnds = true;
1134               first_op1 = gimple_call_arg (call_stmt, 1);
1135             }
1136         }
1137       else
1138         {
1139           if (first_stmt_code != rhs_code
1140               && alt_stmt_code == ERROR_MARK)
1141             alt_stmt_code = rhs_code;
1142           if ((first_stmt_code != rhs_code
1143                && (first_stmt_code != IMAGPART_EXPR
1144                    || rhs_code != REALPART_EXPR)
1145                && (first_stmt_code != REALPART_EXPR
1146                    || rhs_code != IMAGPART_EXPR)
1147                /* Handle mismatches in plus/minus by computing both
1148                   and merging the results.  */
1149                && !((first_stmt_code == PLUS_EXPR
1150                      || first_stmt_code == MINUS_EXPR)
1151                     && (alt_stmt_code == PLUS_EXPR
1152                         || alt_stmt_code == MINUS_EXPR)
1153                     && rhs_code == alt_stmt_code)
1154                && !(first_stmt_code.is_tree_code ()
1155                     && rhs_code.is_tree_code ()
1156                     && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1157                         == tcc_comparison)
1158                     && (swap_tree_comparison (tree_code (first_stmt_code))
1159                         == tree_code (rhs_code)))
1160                && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1161                     && (first_stmt_code == ARRAY_REF
1162                         || first_stmt_code == BIT_FIELD_REF
1163                         || first_stmt_code == INDIRECT_REF
1164                         || first_stmt_code == COMPONENT_REF
1165                         || first_stmt_code == MEM_REF)
1166                     && (rhs_code == ARRAY_REF
1167                         || rhs_code == BIT_FIELD_REF
1168                         || rhs_code == INDIRECT_REF
1169                         || rhs_code == COMPONENT_REF
1170                         || rhs_code == MEM_REF)))
1171               || first_stmt_load_p != load_p
1172               || first_stmt_phi_p != phi_p)
1173             {
1174               if (dump_enabled_p ())
1175                 {
1176                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1177                                    "Build SLP failed: different operation "
1178                                    "in stmt %G", stmt);
1179                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1180                                    "original stmt %G", first_stmt_info->stmt);
1181                 }
1182               /* Mismatch.  */
1183               continue;
1184             }
1185           /* Non-load BIT_FIELD_REFs must all extract from the same operand.  */
1186           if (!load_p
1187               && first_stmt_code == BIT_FIELD_REF
1188               && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1189                   != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1190             {
1191               if (dump_enabled_p ())
1192                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1193                                  "Build SLP failed: different BIT_FIELD_REF "
1194                                  "arguments in %G", stmt);
1195               /* Mismatch.  */
1196               continue;
1197             }
1198           /* Except in CFN_MASK_LOAD groups, calls must be compatible with the first call.  */
1199           if (call_stmt && first_stmt_code != CFN_MASK_LOAD)
1200             {
1201               if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1202                                        call_stmt))
1203                 {
1204                   if (dump_enabled_p ())
1205                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1206                                      "Build SLP failed: different calls in %G",
1207                                      stmt);
1208                   /* Mismatch.  */
1209                   continue;
1210                 }
1211             }
1212           /* PHIs and possibly trapping stmts must be in the first stmt's basic block.  */
1213           if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1214               && (gimple_bb (first_stmt_info->stmt)
1215                   != gimple_bb (stmt_info->stmt)))
1216             {
1217               if (dump_enabled_p ())
1218                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1219                                  "Build SLP failed: different BB for PHI "
1220                                  "or possibly trapping operation in %G", stmt);
1221               /* Mismatch.  */
1222               continue;
1223             }
1224           /* Where a uniform operand 1 is required it must equal that of the first stmt.  */
1225           if (need_same_oprnds)
1226             {
1227               tree other_op1 = gimple_arg (stmt, 1);
1228               if (!operand_equal_p (first_op1, other_op1, 0))
1229                 {
1230                   if (dump_enabled_p ())
1231                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1232                                      "Build SLP failed: different shift "
1233                                      "arguments in %G", stmt);
1234                   /* Mismatch.  */
1235                   continue;
1236                 }
1237             }
1238           /* The vector type must be compatible with that of the first stmt.  */
1239           if (!types_compatible_p (vectype, *node_vectype))
1240             {
1241               if (dump_enabled_p ())
1242                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1243                                  "Build SLP failed: different vector type "
1244                                  "in %G", stmt);
1245               /* Mismatch.  */
1246               continue;
1247             }
1248         }
1249
1250       /* Grouped store or load.  */
1251       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1252         {
1253           if (REFERENCE_CLASS_P (lhs))
1254             {
1255               /* Store.  */
1256               ;
1257             }
1258           else
1259             {
1260               /* Load.  */
1261               first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1262               if (prev_first_load)
1263                 {
1264                   /* Check that there are no loads from different interleaving
1265                      chains in the same node.  */
1266                   if (prev_first_load != first_load)
1267                     {
1268                       if (dump_enabled_p ())
1269                         dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1270                                          vect_location,
1271                                          "Build SLP failed: different "
1272                                          "interleaving chains in one node %G",
1273                                          stmt);
1274                       /* Mismatch.  */
1275                       continue;
1276                     }
1277                 }
1278               else
1279                 prev_first_load = first_load;
1280            }
1281         } /* Grouped access.  */
1282       else
1283         {
1284           if (load_p
1285               && rhs_code != CFN_GATHER_LOAD
1286               && rhs_code != CFN_MASK_GATHER_LOAD)
1287             {
1288               /* Not grouped load.  */
1289               if (dump_enabled_p ())
1290                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1291                                  "Build SLP failed: not grouped load %G", stmt);
1292
1293               /* FORNOW: Not grouped loads are not supported.  */
1294               if (is_a <bb_vec_info> (vinfo) && i != 0)
1295                 continue;
1296               /* Fatal mismatch.  */
1297               matches[0] = false;
1298               return false;
1299             }
1300
1301           /* Not a memory operation: only the code classes and codes below pass.  */
1302           if (!phi_p
1303               && rhs_code.is_tree_code ()
1304               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1305               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1306               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1307               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1308               && rhs_code != VIEW_CONVERT_EXPR
1309               && rhs_code != CALL_EXPR
1310               && rhs_code != BIT_FIELD_REF)
1311             {
1312               if (dump_enabled_p ())
1313                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1314                                  "Build SLP failed: operation unsupported %G",
1315                                  stmt);
1316               if (is_a <bb_vec_info> (vinfo) && i != 0)
1317                 continue;
1318               /* Fatal mismatch.  */
1319               matches[0] = false;
1320               return false;
1321             }
1322
1323           if (rhs_code == COND_EXPR)
1324             {
1325               tree cond_expr = gimple_assign_rhs1 (stmt);
1326               enum tree_code cond_code = TREE_CODE (cond_expr);
1327               enum tree_code swap_code = ERROR_MARK;
1328               enum tree_code invert_code = ERROR_MARK;
1329
1330               if (i == 0)
1331                 first_cond_code = TREE_CODE (cond_expr);
1332               else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1333                 {
1334                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1335                   swap_code = swap_tree_comparison (cond_code);
1336                   invert_code = invert_tree_comparison (cond_code, honor_nans);
1337                 }
1338
1339               if (first_cond_code == cond_code)
1340                 ;
1341               /* Isomorphic can be achieved by swapping.  */
1342               else if (first_cond_code == swap_code)
1343                 swap[i] = 1;
1344               /* Isomorphic can be achieved by inverting.  */
1345               else if (first_cond_code == invert_code)
1346                 swap[i] = 2;
1347               else
1348                 {
1349                   if (dump_enabled_p ())
1350                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1351                                      "Build SLP failed: different"
1352                                      " operation %G", stmt);
1353                   /* Mismatch.  */
1354                   continue;
1355                 }
1356             }
1357           /* A comparison that is the swapped form of the first stmt's code needs its operands swapped.  */
1358           if (rhs_code.is_tree_code ()
1359               && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1360               && (swap_tree_comparison ((tree_code)first_stmt_code)
1361                   == (tree_code)rhs_code))
1362             swap[i] = 1;
1363         }
1364       /* No mismatch was found for this stmt.  */
1365       matches[i] = true;
1366     }
1367   /* Fail if any stmt was left unmatched.  */
1368   for (i = 0; i < group_size; ++i)
1369     if (!matches[i])
1370       return false;
1371
1372   /* If we allowed a second operation code that is neither a reference
1373      nor a comparison, tell the caller this is a two-operator node.  */
1374   if (alt_stmt_code != ERROR_MARK
1375       && (!alt_stmt_code.is_tree_code ()
1376           || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1377               && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1378     {
1379       *two_operators = true;
1380     }
1381   /* A recorded nunits failure makes the build fail even though all stmts matched.  */
1382   if (maybe_soft_fail)
1383     {
1384       unsigned HOST_WIDE_INT const_nunits;
1385       if (!TYPE_VECTOR_SUBPARTS
1386             (soft_fail_nunits_vectype).is_constant (&const_nunits)
1387           || const_nunits > group_size)
1388         matches[0] = false;
1389       else
1390         {
1391           /* With constant vector elements simulate a mismatch at the
1392              point we need to split.  */
1393           unsigned tail = group_size & (const_nunits - 1);  /* NOTE(review): mask assumes power-of-two CONST_NUNITS.  */
1394           memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1395         }
1396       return false;
1397     }
1398
1399   return true;
1400 }
1401
1402 /* Hash traits keyed on a vector of scalar stmts.  Entries are only dropped
1403    at destruction time, so "deleted" needs no marker distinct from "empty".  */
1404 struct bst_traits
1405 {
1406   typedef vec <stmt_vec_info> value_type;
1407   typedef value_type compare_type;
1408   static const bool empty_zero_p = true;
1409   static inline hashval_t hash (value_type);
1410   static inline bool equal (value_type existing, value_type candidate);
1411   static inline bool is_empty (value_type x) { return !x.exists (); }
1412   static inline bool is_deleted (value_type x) { return is_empty (x); }
1413   static inline void remove (value_type &x) { x.release (); }
1414   static inline void mark_empty (value_type &x) { x.release (); }
1415   static inline void mark_deleted (value_type &x) { x.release (); }
1416 };
1417 inline hashval_t
1418 bst_traits::hash (value_type x)
1419 {
1420   inchash::hash hstate;
1421   unsigned ix;
1422   stmt_vec_info elt;
1423   FOR_EACH_VEC_ELT (x, ix, elt)
1424     hstate.add_int (gimple_uid (elt->stmt));
1425   return hstate.end ();
1426 }
1427 inline bool
1428 bst_traits::equal (value_type existing, value_type candidate)
1429 {
1430   unsigned pos = existing.length ();
1431   bool same = pos == candidate.length ();
1432   while (same && pos-- > 0)
1433     same = existing[pos] == candidate[pos];
1434   return same;
1435 }
1436
1437 /* One entry of a linearized operation chain: operand OP, its def type DT
1438    and the operation CODE it enters the chain with.  ???  This was a nested
1439    std::pair, but vec::insert does memmove, not compatible with std::pair.  */
1440 struct chain_op_t
1441 {
1442   tree_code code;
1443   vect_def_type dt;
1444   tree op;
1445   chain_op_t (tree_code code_in, vect_def_type dt_in, tree op_in)
1446     : code (code_in), dt (dt_in), op (op_in) {}
1447 };
1448
1449 /* Sort callback ordering chain_op_t entries by def type, then by code.  */
1450
1451 static int
1452 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1453 {
1454   const chain_op_t *lhs = static_cast <const chain_op_t *> (op1_);
1455   const chain_op_t *rhs = static_cast <const chain_op_t *> (op2_);
1456   int dt_diff = (int) lhs->dt - (int) rhs->dt;
1457   int code_diff = (int) lhs->code - (int) rhs->code;
1458   return dt_diff != 0 ? dt_diff : code_diff;
1459 }
1460
1461 /* Flatten the expression tree rooted at START, built from the associatable
1462    operation CODE (with PLUS_EXPR, MINUS_EXPR stmts take part as well), into
1463    CHAIN.  WORKLIST is scratch storage.  CODE_STMT and ALT_CODE_STMT, when
1464    still unset, receive the first visited stmt computing CODE respectively
1465    MINUS_EXPR.  If CHAIN_STMTS is not NULL every visited stmt is appended
1466    to it, beginning with START.  */
1467
1468 static void
1469 vect_slp_linearize_chain (vec_info *vinfo,
1470                           vec<std::pair<tree_code, gimple *> > &worklist,
1471                           vec<chain_op_t> &chain,
1472                           enum tree_code code, gimple *start,
1473                           gimple *&code_stmt, gimple *&alt_code_stmt,
1474                           vec<gimple *> *chain_stmts)
1475 {
1476   /* Walk the tree with an explicit worklist; each entry pairs a stmt
1477      with the code under which its result enters the chain.  */
1478   worklist.safe_push (std::make_pair (code, start));
1479   while (!worklist.is_empty ())
1480     {
1481       std::pair<tree_code, gimple *> item = worklist.pop ();
1482       const tree_code in_code = item.first;
1483       gassign *stmt = as_a <gassign *> (item.second);
1484       const tree_code this_code = gimple_assign_rhs_code (stmt);
1485       /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
1486       if (!code_stmt && this_code == code)
1487         code_stmt = stmt;
1488       else if (!alt_code_stmt && this_code == MINUS_EXPR)
1489         alt_code_stmt = stmt;
1490       if (chain_stmts)
1491         chain_stmts->safe_push (stmt);
1492       for (unsigned opnum = 1; opnum <= 2; ++opnum)
1493         {
1494           tree op = gimple_op (stmt, opnum);
1495           vect_def_type dt;
1496           stmt_vec_info def_stmt_info;
1497           bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1498           gcc_assert (res);
1499           const bool internal_p = dt == vect_internal_def;
1500           if (internal_p && is_pattern_stmt_p (def_stmt_info))
1501             op = gimple_get_lhs (def_stmt_info->stmt);
1502
1503           /* Code under which this operand enters the chain: the first
1504              operand of a MINUS_EXPR is added, and everything below a
1505              subtracted operand has its sign flipped.  */
1506           tree_code op_def_code = this_code;
1507           if (op_def_code == MINUS_EXPR && opnum == 1)
1508             op_def_code = PLUS_EXPR;
1509           if (in_code == MINUS_EXPR)
1510             op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1511
1512           /* A single-use internal def computed by CODE, or by MINUS_EXPR
1513              when CODE is PLUS_EXPR, is walked further; anything else
1514              ends the walk here and is recorded in CHAIN.  */
1515           bool continue_p = false;
1516           gimple *use_stmt;
1517           use_operand_p use_p;
1518           if (internal_p
1519               && single_imm_use (op, &use_p, &use_stmt)
1520               && is_gimple_assign (def_stmt_info->stmt))
1521             {
1522               tree_code def_code = gimple_assign_rhs_code (def_stmt_info->stmt);
1523               continue_p = (def_code == code
1524                             || (code == PLUS_EXPR && def_code == MINUS_EXPR));
1525             }
1526           if (continue_p)
1527             worklist.safe_push (std::make_pair (op_def_code,
1528                                                 def_stmt_info->stmt));
1529           else
1530             chain.safe_push (chain_op_t (op_def_code, dt, op));
1531         }
1532     }
1533 }
1534 /* Map from a vector of scalar stmts, hashed with bst_traits, to an SLP node.  */
1535 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1536                   simple_hashmap_traits <bst_traits, slp_tree> >
1537   scalar_stmts_to_slp_tree_map_t;
1538 /* Forward declaration; vect_build_slp_tree below calls this with the stub node it put into BST_MAP.  */
1539 static slp_tree
1540 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1541                        vec<stmt_vec_info> stmts, unsigned int group_size,
1542                        poly_uint64 *max_nunits,
1543                        bool *matches, unsigned *limit, unsigned *tree_size,
1544                        scalar_stmts_to_slp_tree_map_t *bst_map);
1545
     /* Build, or re-use from BST_MAP, the SLP node for the GROUP_SIZE
        scalar stmts STMTS.

        If BST_MAP already has an entry for STMTS and that node did not
        fail, its reference count is incremented, *MAX_NUNITS is updated
        from it, STMTS is released and the node is returned.  If the entry
        is a failed node, its recorded per-lane results are copied to
        MATCHES and NULL is returned.

        Otherwise a stub node is entered into BST_MAP and handed to
        vect_build_slp_tree_2 to be filled.  *LIMIT is the remaining
        discovery budget: when it is zero no discovery is attempted,
        MATCHES is cleared and NULL is returned; otherwise it is
        decremented.  When discovery fails the stub stays in BST_MAP marked
        as failed with a copy of MATCHES and NULL is returned.  When it
        succeeds the node is returned with an additional reference held on
        behalf of BST_MAP and *MAX_NUNITS is updated.

        STMTS is released by this function only on the re-use path.
        TREE_SIZE is passed through to vect_build_slp_tree_2.  */
1546 static slp_tree
1547 vect_build_slp_tree (vec_info *vinfo,
1548                      vec<stmt_vec_info> stmts, unsigned int group_size,
1549                      poly_uint64 *max_nunits,
1550                      bool *matches, unsigned *limit, unsigned *tree_size,
1551                      scalar_stmts_to_slp_tree_map_t *bst_map)
1552 {
       /* Re-use an existing entry for the same scalar stmts, if any.  */
1553   if (slp_tree *leader = bst_map->get (stmts))
1554     {
1555       if (dump_enabled_p ())
1556         dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1557                          !(*leader)->failed ? "" : "failed ",
1558                          (void *) *leader);
1559       if (!(*leader)->failed)
1560         {
1561           SLP_TREE_REF_COUNT (*leader)++;
1562           vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1563           stmts.release ();
1564           return *leader;
1565         }
           /* A node that failed before: hand the per-lane results recorded
              back then to the caller instead of repeating discovery.  */
1566       memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1567       return NULL;
1568     }
1569
1570   /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1571      so we can pick up backedge destinations during discovery.  */
1572   slp_tree res = new _slp_tree;
1573   SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1574   SLP_TREE_SCALAR_STMTS (res) = stmts;
       /* The map key is a copy of STMTS; RES itself refers to STMTS.  */
1575   bst_map->put (stmts.copy (), res);
1576
1577   if (*limit == 0)
1578     {
1579       if (dump_enabled_p ())
1580         dump_printf_loc (MSG_NOTE, vect_location,
1581                          "SLP discovery limit exceeded\n");
1582       /* Mark the node invalid so we can detect those when still in use
1583          as backedge destinations.  */
1584       SLP_TREE_SCALAR_STMTS (res) = vNULL;
1585       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
           /* Record, and report to the caller, every lane as not
              matching.  */
1586       res->failed = XNEWVEC (bool, group_size);
1587       memset (res->failed, 0, sizeof (bool) * group_size);
1588       memset (matches, 0, sizeof (bool) * group_size);
1589       return NULL;
1590     }
       /* Consume one unit of the discovery budget.  */
1591   --*limit;
1592
1593   if (dump_enabled_p ())
1594     dump_printf_loc (MSG_NOTE, vect_location,
1595                      "starting SLP discovery for node %p\n", (void *) res);
1596
       /* Discovery of this node starts from a max_nunits of 1; the result
          is merged into *MAX_NUNITS only on success.  */
1597   poly_uint64 this_max_nunits = 1;
1598   slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1599                                         &this_max_nunits,
1600                                         matches, limit, tree_size, bst_map);
1601   if (!res_)
1602     {
1603       if (dump_enabled_p ())
1604         dump_printf_loc (MSG_NOTE, vect_location,
1605                          "SLP discovery for node %p failed\n", (void *) res);
1606       /* Mark the node invalid so we can detect those when still in use
1607          as backedge destinations.  */
1608       SLP_TREE_SCALAR_STMTS (res) = vNULL;
1609       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1610       res->failed = XNEWVEC (bool, group_size);
           /* With checking enabled verify that the failed discovery
              flagged at least one lane as not matching.  */
1611       if (flag_checking)
1612         {
1613           unsigned i;
1614           for (i = 0; i < group_size; ++i)
1615             if (!matches[i])
1616               break;
1617           gcc_assert (i < group_size);
1618         }
           /* Remember the per-lane results so a later lookup of the same
              stmts can return them without repeating discovery.  */
1619       memcpy (res->failed, matches, sizeof (bool) * group_size);
1620     }
1621   else
1622     {
1623       if (dump_enabled_p ())
1624         dump_printf_loc (MSG_NOTE, vect_location,
1625                          "SLP discovery for node %p succeeded\n",
1626                          (void *) res);
           /* vect_build_slp_tree_2 is expected to have filled in the stub
              it was handed rather than returning a different node.  */
1627       gcc_assert (res_ == res);
1628       res->max_nunits = this_max_nunits;
1629       vect_update_max_nunits (max_nunits, this_max_nunits);
1630       /* Keep a reference for the bst_map use.  */
1631       SLP_TREE_REF_COUNT (res)++;
1632     }
1633   return res_;
1634 }
1635
1636 /* Helper for building an associated SLP node chain.
        Create two new internal-def nodes of vector type VECTYPE which both
        have OP0 and OP1 as children, the first with representative OPER1
        and the second with representative OPER2.  Then set up PERM as a
        VEC_PERM_EXPR node of vector type VECTYPE with lane permutation
        LPERM and push the two new nodes as its children.  The number of
        lanes of all three nodes is taken from OP1.
        The reference counts of OP0 and OP1 are incremented once each, for
        their use by the second new node.  PERM's children are added with
        quick_push and no space is reserved here, so presumably the caller
        must have created PERM with room for two children.  */
1637
1638 static void
1639 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1640                                    slp_tree op0, slp_tree op1,
1641                                    stmt_vec_info oper1, stmt_vec_info oper2,
1642                                    vec<std::pair<unsigned, unsigned> > lperm)
1643 {
1644   unsigned group_size = SLP_TREE_LANES (op1);
1645
       /* First operation node: OP0 and OP1 combined as in OPER1.  */
1646   slp_tree child1 = new _slp_tree;
1647   SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1648   SLP_TREE_VECTYPE (child1) = vectype;
1649   SLP_TREE_LANES (child1) = group_size;
1650   SLP_TREE_CHILDREN (child1).create (2);
1651   SLP_TREE_CHILDREN (child1).quick_push (op0);
1652   SLP_TREE_CHILDREN (child1).quick_push (op1);
1653   SLP_TREE_REPRESENTATIVE (child1) = oper1;
1654
       /* Second operation node: the same operands combined as in OPER2.
          It shares OP0 and OP1 with CHILD1, so bump their reference
          counts.  */
1655   slp_tree child2 = new _slp_tree;
1656   SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1657   SLP_TREE_VECTYPE (child2) = vectype;
1658   SLP_TREE_LANES (child2) = group_size;
1659   SLP_TREE_CHILDREN (child2).create (2);
1660   SLP_TREE_CHILDREN (child2).quick_push (op0);
1661   SLP_TREE_REF_COUNT (op0)++;
1662   SLP_TREE_CHILDREN (child2).quick_push (op1);
1663   SLP_TREE_REF_COUNT (op1)++;
1664   SLP_TREE_REPRESENTATIVE (child2) = oper2;
1665
       /* The permute node selecting lanes from CHILD1 and CHILD2 as
          described by LPERM.  */
1666   SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1667   SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1668   SLP_TREE_VECTYPE (perm) = vectype;
1669   SLP_TREE_LANES (perm) = group_size;
1670   /* ???  We should set this NULL but that's not expected.  */
1671   SLP_TREE_REPRESENTATIVE (perm) = oper1;
1672   SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1673   SLP_TREE_CHILDREN (perm).quick_push (child1);
1674   SLP_TREE_CHILDREN (perm).quick_push (child2);
1675 }
1676
1677 /* Recursively build an SLP tree starting from NODE.
1678    Fail (and return NULL) if def-stmts are not
1679    isomorphic, require data permutation or are of unsupported types of
1680    operation.  Otherwise, return the built SLP node.
1681    On failure MATCHES records the lanes for which a mismatch
1682    was found.  */
1683
1684 static slp_tree
1685 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1686                        vec<stmt_vec_info> stmts, unsigned int group_size,
1687                        poly_uint64 *max_nunits,
1688                        bool *matches, unsigned *limit, unsigned *tree_size,
1689                        scalar_stmts_to_slp_tree_map_t *bst_map)
1690 {
1691   unsigned nops, i, this_tree_size = 0;
1692   poly_uint64 this_max_nunits = *max_nunits;
1693
1694   matches[0] = false;
1695
1696   stmt_vec_info stmt_info = stmts[0];
1697   if (!is_a<gcall *> (stmt_info->stmt)
1698       && !is_a<gassign *> (stmt_info->stmt)
1699       && !is_a<gphi *> (stmt_info->stmt))
1700     return NULL;
1701
1702   nops = gimple_num_args (stmt_info->stmt);
1703   if (const int *map = vect_get_operand_map (stmt_info->stmt))
1704     nops = map[0];
1705
1706   /* If the SLP node is a PHI (induction or reduction), terminate
1707      the recursion.  */
1708   bool *skip_args = XALLOCAVEC (bool, nops);
1709   memset (skip_args, 0, sizeof (bool) * nops);
1710   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1711     if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1712       {
1713         tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1714         tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1715                                                     group_size);
1716         if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1717                                      max_nunits))
1718           return NULL;
1719
1720         vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1721         if (def_type == vect_induction_def)
1722           {
1723             /* Induction PHIs are not cycles but walk the initial
1724                value.  Only for inner loops through, for outer loops
1725                we need to pick up the value from the actual PHIs
1726                to more easily support peeling and epilogue vectorization.  */
1727             class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1728             if (!nested_in_vect_loop_p (loop, stmt_info))
1729               skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1730             else
1731               loop = loop->inner;
1732             skip_args[loop_latch_edge (loop)->dest_idx] = true;
1733           }
1734         else if (def_type == vect_reduction_def
1735                  || def_type == vect_double_reduction_def
1736                  || def_type == vect_nested_cycle
1737                  || def_type == vect_first_order_recurrence)
1738           {
1739             /* Else def types have to match.  */
1740             stmt_vec_info other_info;
1741             bool all_same = true;
1742             FOR_EACH_VEC_ELT (stmts, i, other_info)
1743               {
1744                 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1745                   return NULL;
1746                 if (other_info != stmt_info)
1747                   all_same = false;
1748               }
1749             class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1750             /* Reduction initial values are not explicitely represented.  */
1751             if (def_type != vect_first_order_recurrence
1752                 && !nested_in_vect_loop_p (loop, stmt_info))
1753               skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1754             /* Reduction chain backedge defs are filled manually.
1755                ???  Need a better way to identify a SLP reduction chain PHI.
1756                Or a better overall way to SLP match those.  */
1757             if (all_same && def_type == vect_reduction_def)
1758               skip_args[loop_latch_edge (loop)->dest_idx] = true;
1759           }
1760         else if (def_type != vect_internal_def)
1761           return NULL;
1762       }
1763
1764
1765   bool two_operators = false;
1766   unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1767   tree vectype = NULL_TREE;
1768   if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1769                               &this_max_nunits, matches, &two_operators,
1770                               &vectype))
1771     return NULL;
1772
1773   /* If the SLP node is a load, terminate the recursion unless masked.  */
1774   if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1775       && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1776     {
1777       if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1778         gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1779                     || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1780                     || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1781       else
1782         {
1783           *max_nunits = this_max_nunits;
1784           (*tree_size)++;
1785           node = vect_create_new_slp_node (node, stmts, 0);
1786           SLP_TREE_VECTYPE (node) = vectype;
1787           /* And compute the load permutation.  Whether it is actually
1788              a permutation depends on the unrolling factor which is
1789              decided later.  */
1790           vec<unsigned> load_permutation;
1791           int j;
1792           stmt_vec_info load_info;
1793           load_permutation.create (group_size);
1794           stmt_vec_info first_stmt_info
1795             = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1796           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1797             {
1798               int load_place = vect_get_place_in_interleaving_chain
1799                   (load_info, first_stmt_info);
1800               gcc_assert (load_place != -1);
1801               load_permutation.safe_push (load_place);
1802             }
1803           SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1804           return node;
1805         }
1806     }
1807   else if (gimple_assign_single_p (stmt_info->stmt)
1808            && !gimple_vuse (stmt_info->stmt)
1809            && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1810     {
1811       /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1812          the same SSA name vector of a compatible type to vectype.  */
1813       vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1814       tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1815       stmt_vec_info estmt_info;
1816       FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1817         {
1818           gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1819           tree bfref = gimple_assign_rhs1 (estmt);
1820           HOST_WIDE_INT lane;
1821           if (!known_eq (bit_field_size (bfref),
1822                          tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1823               || !constant_multiple_p (bit_field_offset (bfref),
1824                                        bit_field_size (bfref), &lane))
1825             {
1826               lperm.release ();
1827               matches[0] = false;
1828               return NULL;
1829             }
1830           lperm.safe_push (std::make_pair (0, (unsigned)lane));
1831         }
1832       slp_tree vnode = vect_create_new_slp_node (vNULL);
1833       if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1834         /* ???  We record vectype here but we hide eventually necessary
1835            punning and instead rely on code generation to materialize
1836            VIEW_CONVERT_EXPRs as necessary.  We instead should make
1837            this explicit somehow.  */
1838         SLP_TREE_VECTYPE (vnode) = vectype;
1839       else
1840         {
1841           /* For different size but compatible elements we can still
1842              use VEC_PERM_EXPR without punning.  */
1843           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
1844                       && types_compatible_p (TREE_TYPE (vectype),
1845                                              TREE_TYPE (TREE_TYPE (vec))));
1846           SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1847         }
1848       auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
1849       unsigned HOST_WIDE_INT const_nunits;
1850       if (nunits.is_constant (&const_nunits))
1851         SLP_TREE_LANES (vnode) = const_nunits;
1852       SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1853       /* We are always building a permutation node even if it is an identity
1854          permute to shield the rest of the vectorizer from the odd node
1855          representing an actual vector without any scalar ops.
1856          ???  We could hide it completely with making the permute node
1857          external?  */
1858       node = vect_create_new_slp_node (node, stmts, 1);
1859       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1860       SLP_TREE_LANE_PERMUTATION (node) = lperm;
1861       SLP_TREE_VECTYPE (node) = vectype;
1862       SLP_TREE_CHILDREN (node).quick_push (vnode);
1863       return node;
1864     }
1865   /* When discovery reaches an associatable operation see whether we can
1866      improve that to match up lanes in a way superior to the operand
1867      swapping code which at most looks at two defs.
1868      ???  For BB vectorization we cannot do the brute-force search
1869      for matching as we can succeed by means of builds from scalars
1870      and have no good way to "cost" one build against another.  */
1871   else if (is_a <loop_vec_info> (vinfo)
1872            /* ???  We don't handle !vect_internal_def defs below.  */
1873            && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1874            && is_gimple_assign (stmt_info->stmt)
1875            && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1876                || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1877            && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1878                || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1879                    && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1880     {
1881       /* See if we have a chain of (mixed) adds or subtracts or other
1882          associatable ops.  */
1883       enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1884       if (code == MINUS_EXPR)
1885         code = PLUS_EXPR;
1886       stmt_vec_info other_op_stmt_info = NULL;
1887       stmt_vec_info op_stmt_info = NULL;
1888       unsigned chain_len = 0;
1889       auto_vec<chain_op_t> chain;
1890       auto_vec<std::pair<tree_code, gimple *> > worklist;
1891       auto_vec<vec<chain_op_t> > chains (group_size);
1892       auto_vec<slp_tree, 4> children;
1893       bool hard_fail = true;
1894       for (unsigned lane = 0; lane < group_size; ++lane)
1895         {
1896           /* For each lane linearize the addition/subtraction (or other
1897              uniform associatable operation) expression tree.  */
1898           gimple *op_stmt = NULL, *other_op_stmt = NULL;
1899           vect_slp_linearize_chain (vinfo, worklist, chain, code,
1900                                     stmts[lane]->stmt, op_stmt, other_op_stmt,
1901                                     NULL);
1902           if (!op_stmt_info && op_stmt)
1903             op_stmt_info = vinfo->lookup_stmt (op_stmt);
1904           if (!other_op_stmt_info && other_op_stmt)
1905             other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1906           if (chain.length () == 2)
1907             {
1908               /* In a chain of just two elements resort to the regular
1909                  operand swapping scheme.  If we run into a length
1910                  mismatch still hard-FAIL.  */
1911               if (chain_len == 0)
1912                 hard_fail = false;
1913               else
1914                 {
1915                   matches[lane] = false;
1916                   /* ???  We might want to process the other lanes, but
1917                      make sure to not give false matching hints to the
1918                      caller for lanes we did not process.  */
1919                   if (lane != group_size - 1)
1920                     matches[0] = false;
1921                 }
1922               break;
1923             }
1924           else if (chain_len == 0)
1925             chain_len = chain.length ();
1926           else if (chain.length () != chain_len)
1927             {
1928               /* ???  Here we could slip in magic to compensate with
1929                  neutral operands.  */
1930               matches[lane] = false;
1931               if (lane != group_size - 1)
1932                 matches[0] = false;
1933               break;
1934             }
1935           chains.quick_push (chain.copy ());
1936           chain.truncate (0);
1937         }
1938       if (chains.length () == group_size)
1939         {
1940           /* We cannot yet use SLP_TREE_CODE to communicate the operation.  */
1941           if (!op_stmt_info)
1942             {
1943               hard_fail = false;
1944               goto out;
1945             }
1946           /* Now we have a set of chains with the same length.  */
1947           /* 1. pre-sort according to def_type and operation.  */
1948           for (unsigned lane = 0; lane < group_size; ++lane)
1949             chains[lane].stablesort (dt_sort_cmp, vinfo);
1950           if (dump_enabled_p ())
1951             {
1952               dump_printf_loc (MSG_NOTE, vect_location,
1953                                "pre-sorted chains of %s\n",
1954                                get_tree_code_name (code));
1955               for (unsigned lane = 0; lane < group_size; ++lane)
1956                 {
1957                   for (unsigned opnum = 0; opnum < chain_len; ++opnum)
1958                     dump_printf (MSG_NOTE, "%s %T ",
1959                                  get_tree_code_name (chains[lane][opnum].code),
1960                                  chains[lane][opnum].op);
1961                   dump_printf (MSG_NOTE, "\n");
1962                 }
1963             }
1964           /* 2. try to build children nodes, associating as necessary.  */
1965           for (unsigned n = 0; n < chain_len; ++n)
1966             {
1967               vect_def_type dt = chains[0][n].dt;
1968               unsigned lane;
1969               for (lane = 0; lane < group_size; ++lane)
1970                 if (chains[lane][n].dt != dt)
1971                   {
1972                     if (dt == vect_constant_def
1973                         && chains[lane][n].dt == vect_external_def)
1974                       dt = vect_external_def;
1975                     else if (dt == vect_external_def
1976                              && chains[lane][n].dt == vect_constant_def)
1977                       ;
1978                     else
1979                       break;
1980                   }
1981               if (lane != group_size)
1982                 {
1983                   if (dump_enabled_p ())
1984                     dump_printf_loc (MSG_NOTE, vect_location,
1985                                      "giving up on chain due to mismatched "
1986                                      "def types\n");
1987                   matches[lane] = false;
1988                   if (lane != group_size - 1)
1989                     matches[0] = false;
1990                   goto out;
1991                 }
1992               if (dt == vect_constant_def
1993                   || dt == vect_external_def)
1994                 {
1995                   /* Check whether we can build the invariant.  If we can't
1996                      we never will be able to.  */
1997                   tree type = TREE_TYPE (chains[0][n].op);
1998                   if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
1999                       && (TREE_CODE (type) == BOOLEAN_TYPE
2000                           || !can_duplicate_and_interleave_p (vinfo, group_size,
2001                                                               type)))
2002                     {
2003                       matches[0] = false;
2004                       goto out;
2005                     }
2006                   vec<tree> ops;
2007                   ops.create (group_size);
2008                   for (lane = 0; lane < group_size; ++lane)
2009                     ops.quick_push (chains[lane][n].op);
2010                   slp_tree child = vect_create_new_slp_node (ops);
2011                   SLP_TREE_DEF_TYPE (child) = dt;
2012                   children.safe_push (child);
2013                 }
2014               else if (dt != vect_internal_def)
2015                 {
2016                   /* Not sure, we might need sth special.
2017                      gcc.dg/vect/pr96854.c,
2018                      gfortran.dg/vect/fast-math-pr37021.f90
2019                      and gfortran.dg/vect/pr61171.f trigger.  */
2020                   /* Soft-fail for now.  */
2021                   hard_fail = false;
2022                   goto out;
2023                 }
2024               else
2025                 {
2026                   vec<stmt_vec_info> op_stmts;
2027                   op_stmts.create (group_size);
2028                   slp_tree child = NULL;
2029                   /* Brute-force our way.  We have to consider a lane
2030                      failing after fixing an earlier fail up in the
2031                      SLP discovery recursion.  So track the current
2032                      permute per lane.  */
2033                   unsigned *perms = XALLOCAVEC (unsigned, group_size);
2034                   memset (perms, 0, sizeof (unsigned) * group_size);
2035                   do
2036                     {
2037                       op_stmts.truncate (0);
2038                       for (lane = 0; lane < group_size; ++lane)
2039                         op_stmts.quick_push
2040                           (vinfo->lookup_def (chains[lane][n].op));
2041                       child = vect_build_slp_tree (vinfo, op_stmts,
2042                                                    group_size, &this_max_nunits,
2043                                                    matches, limit,
2044                                                    &this_tree_size, bst_map);
2045                       /* ???  We're likely getting too many fatal mismatches
2046                          here so maybe we want to ignore them (but then we
2047                          have no idea which lanes fatally mismatched).  */
2048                       if (child || !matches[0])
2049                         break;
2050                       /* Swap another lane we have not yet matched up into
2051                          lanes that did not match.  If we run out of
2052                          permute possibilities for a lane terminate the
2053                          search.  */
2054                       bool term = false;
2055                       for (lane = 1; lane < group_size; ++lane)
2056                         if (!matches[lane])
2057                           {
2058                             if (n + perms[lane] + 1 == chain_len)
2059                               {
2060                                 term = true;
2061                                 break;
2062                               }
2063                             std::swap (chains[lane][n],
2064                                        chains[lane][n + perms[lane] + 1]);
2065                             perms[lane]++;
2066                           }
2067                       if (term)
2068                         break;
2069                     }
2070                   while (1);
2071                   if (!child)
2072                     {
2073                       if (dump_enabled_p ())
2074                         dump_printf_loc (MSG_NOTE, vect_location,
2075                                          "failed to match up op %d\n", n);
2076                       op_stmts.release ();
2077                       if (lane != group_size - 1)
2078                         matches[0] = false;
2079                       else
2080                         matches[lane] = false;
2081                       goto out;
2082                     }
2083                   if (dump_enabled_p ())
2084                     {
2085                       dump_printf_loc (MSG_NOTE, vect_location,
2086                                        "matched up op %d to\n", n);
2087                       vect_print_slp_tree (MSG_NOTE, vect_location, child);
2088                     }
2089                   children.safe_push (child);
2090                 }
2091             }
2092           /* 3. build SLP nodes to combine the chain.  */
2093           for (unsigned lane = 0; lane < group_size; ++lane)
2094             if (chains[lane][0].code != code)
2095               {
2096                 /* See if there's any alternate all-PLUS entry.  */
2097                 unsigned n;
2098                 for (n = 1; n < chain_len; ++n)
2099                   {
2100                     for (lane = 0; lane < group_size; ++lane)
2101                       if (chains[lane][n].code != code)
2102                         break;
2103                     if (lane == group_size)
2104                       break;
2105                   }
2106                 if (n != chain_len)
2107                   {
2108                     /* Swap that in at first position.  */
2109                     std::swap (children[0], children[n]);
2110                     for (lane = 0; lane < group_size; ++lane)
2111                       std::swap (chains[lane][0], chains[lane][n]);
2112                   }
2113                 else
2114                   {
2115                     /* ???  When this triggers and we end up with two
2116                        vect_constant/external_def up-front things break (ICE)
2117                        spectacularly finding an insertion place for the
2118                        all-constant op.  We should have a fully
2119                        vect_internal_def operand though(?) so we can swap
2120                        that into first place and then prepend the all-zero
2121                        constant.  */
2122                     if (dump_enabled_p ())
2123                       dump_printf_loc (MSG_NOTE, vect_location,
2124                                        "inserting constant zero to compensate "
2125                                        "for (partially) negated first "
2126                                        "operand\n");
2127                     chain_len++;
2128                     for (lane = 0; lane < group_size; ++lane)
2129                       chains[lane].safe_insert
2130                         (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2131                     vec<tree> zero_ops;
2132                     zero_ops.create (group_size);
2133                     zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2134                     for (lane = 1; lane < group_size; ++lane)
2135                       zero_ops.quick_push (zero_ops[0]);
2136                     slp_tree zero = vect_create_new_slp_node (zero_ops);
2137                     SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2138                     children.safe_insert (0, zero);
2139                   }
2140                 break;
2141               }
2142           for (unsigned i = 1; i < children.length (); ++i)
2143             {
2144               slp_tree op0 = children[i - 1];
2145               slp_tree op1 = children[i];
2146               bool this_two_op = false;
2147               for (unsigned lane = 0; lane < group_size; ++lane)
2148                 if (chains[lane][i].code != chains[0][i].code)
2149                   {
2150                     this_two_op = true;
2151                     break;
2152                   }
2153               slp_tree child;
2154               if (i == children.length () - 1)
2155                 child = vect_create_new_slp_node (node, stmts, 2);
2156               else
2157                 child = vect_create_new_slp_node (2, ERROR_MARK);
2158               if (this_two_op)
2159                 {
2160                   vec<std::pair<unsigned, unsigned> > lperm;
2161                   lperm.create (group_size);
2162                   for (unsigned lane = 0; lane < group_size; ++lane)
2163                     lperm.quick_push (std::make_pair
2164                       (chains[lane][i].code != chains[0][i].code, lane));
2165                   vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2166                                                      (chains[0][i].code == code
2167                                                       ? op_stmt_info
2168                                                       : other_op_stmt_info),
2169                                                      (chains[0][i].code == code
2170                                                       ? other_op_stmt_info
2171                                                       : op_stmt_info),
2172                                                      lperm);
2173                 }
2174               else
2175                 {
2176                   SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2177                   SLP_TREE_VECTYPE (child) = vectype;
2178                   SLP_TREE_LANES (child) = group_size;
2179                   SLP_TREE_CHILDREN (child).quick_push (op0);
2180                   SLP_TREE_CHILDREN (child).quick_push (op1);
2181                   SLP_TREE_REPRESENTATIVE (child)
2182                     = (chains[0][i].code == code
2183                        ? op_stmt_info : other_op_stmt_info);
2184                 }
2185               children[i] = child;
2186             }
2187           *tree_size += this_tree_size + 1;
2188           *max_nunits = this_max_nunits;
2189           while (!chains.is_empty ())
2190             chains.pop ().release ();
2191           return node;
2192         }
2193 out:
2194       while (!children.is_empty ())
2195         vect_free_slp_tree (children.pop ());
2196       while (!chains.is_empty ())
2197         chains.pop ().release ();
2198       /* Hard-fail, otherwise we might run into quadratic processing of the
2199          chains starting one stmt into the chain again.  */
2200       if (hard_fail)
2201         return NULL;
2202       /* Fall thru to normal processing.  */
2203     }
2204
2205   /* Get at the operands, verifying they are compatible.  */
2206   vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2207   slp_oprnd_info oprnd_info;
2208   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2209     {
2210       int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2211                                              stmts, i, &oprnds_info);
2212       if (res != 0)
2213         matches[(res == -1) ? 0 : i] = false;
2214       if (!matches[0])
2215         break;
2216     }
2217   for (i = 0; i < group_size; ++i)
2218     if (!matches[i])
2219       {
2220         vect_free_oprnd_info (oprnds_info);
2221         return NULL;
2222       }
2223   swap = NULL;
2224
2225   auto_vec<slp_tree, 4> children;
2226
2227   stmt_info = stmts[0];
2228
2229   /* Create SLP_TREE nodes for the definition node/s.  */
2230   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2231     {
2232       slp_tree child;
2233       unsigned int j;
2234
2235       /* We're skipping certain operands from processing, for example
2236          outer loop reduction initial defs.  */
2237       if (skip_args[i])
2238         {
2239           children.safe_push (NULL);
2240           continue;
2241         }
2242
2243       if (oprnd_info->first_dt == vect_uninitialized_def)
2244         {
2245           /* COND_EXPR have one too many eventually if the condition
2246              is a SSA name.  */
2247           gcc_assert (i == 3 && nops == 4);
2248           continue;
2249         }
2250
2251       if (is_a <bb_vec_info> (vinfo)
2252           && oprnd_info->first_dt == vect_internal_def
2253           && !oprnd_info->any_pattern)
2254         {
2255           /* For BB vectorization, if all defs are the same do not
2256              bother to continue the build along the single-lane
2257              graph but use a splat of the scalar value.  */
2258           stmt_vec_info first_def = oprnd_info->def_stmts[0];
2259           for (j = 1; j < group_size; ++j)
2260             if (oprnd_info->def_stmts[j] != first_def)
2261               break;
2262           if (j == group_size
2263               /* But avoid doing this for loads where we may be
2264                  able to CSE things, unless the stmt is not
2265                  vectorizable.  */
2266               && (!STMT_VINFO_VECTORIZABLE (first_def)
2267                   || !gimple_vuse (first_def->stmt)))
2268             {
2269               if (dump_enabled_p ())
2270                 dump_printf_loc (MSG_NOTE, vect_location,
2271                                  "Using a splat of the uniform operand %G",
2272                                  first_def->stmt);
2273               oprnd_info->first_dt = vect_external_def;
2274             }
2275         }
2276
2277       if (oprnd_info->first_dt == vect_external_def
2278           || oprnd_info->first_dt == vect_constant_def)
2279         {
2280           slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2281           SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2282           oprnd_info->ops = vNULL;
2283           children.safe_push (invnode);
2284           continue;
2285         }
2286
2287       if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2288                                         group_size, &this_max_nunits,
2289                                         matches, limit,
2290                                         &this_tree_size, bst_map)) != NULL)
2291         {
2292           oprnd_info->def_stmts = vNULL;
2293           children.safe_push (child);
2294           continue;
2295         }
2296
2297       /* If the SLP build for operand zero failed and operand zero
2298          and one can be commutated try that for the scalar stmts
2299          that failed the match.  */
2300       if (i == 0
2301           /* A first scalar stmt mismatch signals a fatal mismatch.  */
2302           && matches[0]
2303           /* ???  For COND_EXPRs we can swap the comparison operands
2304              as well as the arms under some constraints.  */
2305           && nops == 2
2306           && oprnds_info[1]->first_dt == vect_internal_def
2307           && is_gimple_assign (stmt_info->stmt)
2308           /* Swapping operands for reductions breaks assumptions later on.  */
2309           && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2310           && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2311         {
2312           /* See whether we can swap the matching or the non-matching
2313              stmt operands.  */
2314           bool swap_not_matching = true;
2315           do
2316             {
2317               for (j = 0; j < group_size; ++j)
2318                 {
2319                   if (matches[j] != !swap_not_matching)
2320                     continue;
2321                   stmt_vec_info stmt_info = stmts[j];
2322                   /* Verify if we can swap operands of this stmt.  */
2323                   gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2324                   if (!stmt
2325                       || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2326                     {
2327                       if (!swap_not_matching)
2328                         goto fail;
2329                       swap_not_matching = false;
2330                       break;
2331                     }
2332                 }
2333             }
2334           while (j != group_size);
2335
2336           /* Swap mismatched definition stmts.  */
2337           if (dump_enabled_p ())
2338             dump_printf_loc (MSG_NOTE, vect_location,
2339                              "Re-trying with swapped operands of stmts ");
2340           for (j = 0; j < group_size; ++j)
2341             if (matches[j] == !swap_not_matching)
2342               {
2343                 std::swap (oprnds_info[0]->def_stmts[j],
2344                            oprnds_info[1]->def_stmts[j]);
2345                 std::swap (oprnds_info[0]->ops[j],
2346                            oprnds_info[1]->ops[j]);
2347                 if (dump_enabled_p ())
2348                   dump_printf (MSG_NOTE, "%d ", j);
2349               }
2350           if (dump_enabled_p ())
2351             dump_printf (MSG_NOTE, "\n");
2352           /* After swapping some operands we lost track whether an
2353              operand has any pattern defs so be conservative here.  */
2354           if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2355             oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2356           /* And try again with scratch 'matches' ... */
2357           bool *tem = XALLOCAVEC (bool, group_size);
2358           if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2359                                             group_size, &this_max_nunits,
2360                                             tem, limit,
2361                                             &this_tree_size, bst_map)) != NULL)
2362             {
2363               oprnd_info->def_stmts = vNULL;
2364               children.safe_push (child);
2365               continue;
2366             }
2367         }
2368 fail:
2369
2370       /* If the SLP build failed and we analyze a basic-block
2371          simply treat nodes we fail to build as externally defined
2372          (and thus build vectors from the scalar defs).
2373          The cost model will reject outright expensive cases.
2374          ???  This doesn't treat cases where permutation ultimately
2375          fails (or we don't try permutation below).  Ideally we'd
2376          even compute a permutation that will end up with the maximum
2377          SLP tree size...  */
2378       if (is_a <bb_vec_info> (vinfo)
2379           /* ???  Rejecting patterns this way doesn't work.  We'd have to
2380              do extra work to cancel the pattern so the uses see the
2381              scalar version.  */
2382           && !is_pattern_stmt_p (stmt_info)
2383           && !oprnd_info->any_pattern)
2384         {
2385           /* But if there's a leading vector sized set of matching stmts
2386              fail here so we can split the group.  This matches the condition
2387              vect_analyze_slp_instance uses.  */
2388           /* ???  We might want to split here and combine the results to support
2389              multiple vector sizes better.  */
2390           for (j = 0; j < group_size; ++j)
2391             if (!matches[j])
2392               break;
2393           if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2394             {
2395               if (dump_enabled_p ())
2396                 dump_printf_loc (MSG_NOTE, vect_location,
2397                                  "Building vector operands from scalars\n");
2398               this_tree_size++;
2399               child = vect_create_new_slp_node (oprnd_info->ops);
2400               children.safe_push (child);
2401               oprnd_info->ops = vNULL;
2402               continue;
2403             }
2404         }
2405
2406       gcc_assert (child == NULL);
2407       FOR_EACH_VEC_ELT (children, j, child)
2408         if (child)
2409           vect_free_slp_tree (child);
2410       vect_free_oprnd_info (oprnds_info);
2411       return NULL;
2412     }
2413
2414   vect_free_oprnd_info (oprnds_info);
2415
2416   /* If all children of this node are built up from uniform scalars,
2417      or if building it needs more than one possibly expensive vector
2418      construction, then just throw it away, causing it to be built up
2419      from scalars.  The exception is the SLP node for the vector store.  */
2420   if (is_a <bb_vec_info> (vinfo)
2421       && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2422       /* ???  Rejecting patterns this way doesn't work.  We'd have to
2423          do extra work to cancel the pattern so the uses see the
2424          scalar version.  */
2425       && !is_pattern_stmt_p (stmt_info))
2426     {
2427       slp_tree child;
2428       unsigned j;
2429       bool all_uniform_p = true;
2430       unsigned n_vector_builds = 0;
2431       FOR_EACH_VEC_ELT (children, j, child)
2432         {
2433           if (!child)
2434             ;
2435           else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2436             all_uniform_p = false;
2437           else if (!vect_slp_tree_uniform_p (child))
2438             {
2439               all_uniform_p = false;
2440               if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2441                 n_vector_builds++;
2442             }
2443         }
2444       if (all_uniform_p
2445           || n_vector_builds > 1
2446           || (n_vector_builds == children.length ()
2447               && is_a <gphi *> (stmt_info->stmt)))
2448         {
2449           /* Roll back.  */
2450           matches[0] = false;
2451           FOR_EACH_VEC_ELT (children, j, child)
2452             if (child)
2453               vect_free_slp_tree (child);
2454
2455           if (dump_enabled_p ())
2456             dump_printf_loc (MSG_NOTE, vect_location,
2457                              "Building parent vector operands from "
2458                              "scalars instead\n");
2459           return NULL;
2460         }
2461     }
2462
2463   *tree_size += this_tree_size + 1;
2464   *max_nunits = this_max_nunits;
2465
2466   if (two_operators)
2467     {
2468       /* ???  We'd likely want to either cache in bst_map sth like
2469          { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2470          the true { a+b, a+b, a+b, a+b } ... but there we don't have
2471          explicit stmts to put in so the keying on 'stmts' doesn't
2472          work (but we have the same issue with nodes that use 'ops').  */
2473       slp_tree one = new _slp_tree;
2474       slp_tree two = new _slp_tree;
2475       SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2476       SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2477       SLP_TREE_VECTYPE (one) = vectype;
2478       SLP_TREE_VECTYPE (two) = vectype;
2479       SLP_TREE_CHILDREN (one).safe_splice (children);
2480       SLP_TREE_CHILDREN (two).safe_splice (children);
2481       slp_tree child;
2482       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2483         SLP_TREE_REF_COUNT (child)++;
2484
2485       /* Here we record the original defs since this
2486          node represents the final lane configuration.  */
2487       node = vect_create_new_slp_node (node, stmts, 2);
2488       SLP_TREE_VECTYPE (node) = vectype;
2489       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2490       SLP_TREE_CHILDREN (node).quick_push (one);
2491       SLP_TREE_CHILDREN (node).quick_push (two);
2492       gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2493       enum tree_code code0 = gimple_assign_rhs_code (stmt);
2494       enum tree_code ocode = ERROR_MARK;
2495       stmt_vec_info ostmt_info;
2496       unsigned j = 0;
2497       FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2498         {
2499           gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2500           if (gimple_assign_rhs_code (ostmt) != code0)
2501             {
2502               SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2503               ocode = gimple_assign_rhs_code (ostmt);
2504               j = i;
2505             }
2506           else
2507             SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2508         }
2509       SLP_TREE_CODE (one) = code0;
2510       SLP_TREE_CODE (two) = ocode;
2511       SLP_TREE_LANES (one) = stmts.length ();
2512       SLP_TREE_LANES (two) = stmts.length ();
2513       SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2514       SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2515       return node;
2516     }
2517
2518   node = vect_create_new_slp_node (node, stmts, nops);
2519   SLP_TREE_VECTYPE (node) = vectype;
2520   SLP_TREE_CHILDREN (node).splice (children);
2521   return node;
2522 }
2523
2524 /* Dump a single SLP tree NODE.  */
2525
2526 static void
2527 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2528                      slp_tree node)
2529 {
2530   unsigned i, j;
2531   slp_tree child;
2532   stmt_vec_info stmt_info;
2533   tree op;
2534
2535   dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2536   dump_user_location_t user_loc = loc.get_user_location ();
2537   dump_printf_loc (metadata, user_loc,
2538                    "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2539                    ", refcnt=%u)",
2540                    SLP_TREE_DEF_TYPE (node) == vect_external_def
2541                    ? " (external)"
2542                    : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2543                       ? " (constant)"
2544                       : ""), (void *) node,
2545                    estimated_poly_value (node->max_nunits),
2546                                          SLP_TREE_REF_COUNT (node));
2547   if (SLP_TREE_VECTYPE (node))
2548     dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2549   dump_printf (metadata, "\n");
2550   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2551     {
2552       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2553         dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2554       else
2555         dump_printf_loc (metadata, user_loc, "op template: %G",
2556                          SLP_TREE_REPRESENTATIVE (node)->stmt);
2557     }
2558   if (SLP_TREE_SCALAR_STMTS (node).exists ())
2559     FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2560       dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2561   else
2562     {
2563       dump_printf_loc (metadata, user_loc, "\t{ ");
2564       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2565         dump_printf (metadata, "%T%s ", op,
2566                      i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2567       dump_printf (metadata, "}\n");
2568     }
2569   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2570     {
2571       dump_printf_loc (metadata, user_loc, "\tload permutation {");
2572       FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2573         dump_printf (dump_kind, " %u", j);
2574       dump_printf (dump_kind, " }\n");
2575     }
2576   if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2577     {
2578       dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2579       for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2580         dump_printf (dump_kind, " %u[%u]",
2581                      SLP_TREE_LANE_PERMUTATION (node)[i].first,
2582                      SLP_TREE_LANE_PERMUTATION (node)[i].second);
2583       dump_printf (dump_kind, " }\n");
2584     }
2585   if (SLP_TREE_CHILDREN (node).is_empty ())
2586     return;
2587   dump_printf_loc (metadata, user_loc, "\tchildren");
2588   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2589     dump_printf (dump_kind, " %p", (void *)child);
2590   dump_printf (dump_kind, "\n");
2591 }
2592
2593 DEBUG_FUNCTION void
2594 debug (slp_tree node)
2595 {
2596   debug_dump_context ctx;
2597   vect_print_slp_tree (MSG_NOTE,
2598                        dump_location_t::from_location_t (UNKNOWN_LOCATION),
2599                        node);
2600 }
2601
2602 /* Recursive helper for the dot producer below.  */
2603
2604 static void
2605 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2606 {
2607   if (visited.add (node))
2608     return;
2609
2610   fprintf (f, "\"%p\" [label=\"", (void *)node);
2611   vect_print_slp_tree (MSG_NOTE,
2612                        dump_location_t::from_location_t (UNKNOWN_LOCATION),
2613                        node);
2614   fprintf (f, "\"];\n");
2615
2616
2617   for (slp_tree child : SLP_TREE_CHILDREN (node))
2618     fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2619
2620   for (slp_tree child : SLP_TREE_CHILDREN (node))
2621     if (child)
2622       dot_slp_tree (f, child, visited);
2623 }
2624
2625 DEBUG_FUNCTION void
2626 dot_slp_tree (const char *fname, slp_tree node)
2627 {
2628   FILE *f = fopen (fname, "w");
2629   fprintf (f, "digraph {\n");
2630   fflush (f);
2631     {
2632       debug_dump_context ctx (f);
2633       hash_set<slp_tree> visited;
2634       dot_slp_tree (f, node, visited);
2635     }
2636   fflush (f);
2637   fprintf (f, "}\n");
2638   fclose (f);
2639 }
2640
2641 /* Dump a slp tree NODE using flags specified in DUMP_KIND.  */
2642
2643 static void
2644 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2645                       slp_tree node, hash_set<slp_tree> &visited)
2646 {
2647   unsigned i;
2648   slp_tree child;
2649
2650   if (visited.add (node))
2651     return;
2652
2653   vect_print_slp_tree (dump_kind, loc, node);
2654
2655   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2656     if (child)
2657       vect_print_slp_graph (dump_kind, loc, child, visited);
2658 }
2659
2660 static void
2661 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2662                       slp_tree entry)
2663 {
2664   hash_set<slp_tree> visited;
2665   vect_print_slp_graph (dump_kind, loc, entry, visited);
2666 }
2667
2668 /* Mark the tree rooted at NODE with PURE_SLP.  */
2669
2670 static void
2671 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2672 {
2673   int i;
2674   stmt_vec_info stmt_info;
2675   slp_tree child;
2676
2677   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2678     return;
2679
2680   if (visited.add (node))
2681     return;
2682
2683   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2684     STMT_SLP_TYPE (stmt_info) = pure_slp;
2685
2686   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2687     if (child)
2688       vect_mark_slp_stmts (child, visited);
2689 }
2690
2691 static void
2692 vect_mark_slp_stmts (slp_tree node)
2693 {
2694   hash_set<slp_tree> visited;
2695   vect_mark_slp_stmts (node, visited);
2696 }
2697
2698 /* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */
2699
2700 static void
2701 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2702 {
2703   int i;
2704   stmt_vec_info stmt_info;
2705   slp_tree child;
2706
2707   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2708     return;
2709
2710   if (visited.add (node))
2711     return;
2712
2713   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2714     {
2715       gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2716                   || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2717       STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2718     }
2719
2720   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2721     if (child)
2722       vect_mark_slp_stmts_relevant (child, visited);
2723 }
2724
2725 static void
2726 vect_mark_slp_stmts_relevant (slp_tree node)
2727 {
2728   hash_set<slp_tree> visited;
2729   vect_mark_slp_stmts_relevant (node, visited);
2730 }
2731
2732
2733 /* Gather loads in the SLP graph NODE and populate the INST loads array.  */
2734
2735 static void
2736 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2737                        hash_set<slp_tree> &visited)
2738 {
2739   if (!node || visited.add (node))
2740     return;
2741
2742   if (SLP_TREE_CHILDREN (node).length () == 0)
2743     {
2744       if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2745         return;
2746       stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2747       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2748           && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2749         loads.safe_push (node);
2750     }
2751   else
2752     {
2753       unsigned i;
2754       slp_tree child;
2755       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2756         vect_gather_slp_loads (loads, child, visited);
2757     }
2758 }
2759
2760
2761 /* Find the last store in SLP INSTANCE.  */
2762
2763 stmt_vec_info
2764 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2765 {
2766   stmt_vec_info last = NULL;
2767   stmt_vec_info stmt_vinfo;
2768
2769   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2770     {
2771       stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2772       last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2773     }
2774
2775   return last;
2776 }
2777
2778 /* Find the first stmt in NODE.  */
2779
2780 stmt_vec_info
2781 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2782 {
2783   stmt_vec_info first = NULL;
2784   stmt_vec_info stmt_vinfo;
2785
2786   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2787     {
2788       stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2789       if (!first
2790           || get_later_stmt (stmt_vinfo, first) == first)
2791         first = stmt_vinfo;
2792     }
2793
2794   return first;
2795 }
2796
2797 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2798    two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2799    (also containing the first GROUP1_SIZE stmts, since stores are
2800    consecutive), the second containing the remainder.
2801    Return the first stmt in the second group.  */
2802
2803 static stmt_vec_info
2804 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2805 {
2806   gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2807   gcc_assert (group1_size > 0);
2808   int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2809   gcc_assert (group2_size > 0);
2810   DR_GROUP_SIZE (first_vinfo) = group1_size;
2811
2812   stmt_vec_info stmt_info = first_vinfo;
2813   for (unsigned i = group1_size; i > 1; i--)
2814     {
2815       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2816       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2817     }
2818   /* STMT is now the last element of the first group.  */
2819   stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2820   DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2821
2822   DR_GROUP_SIZE (group2) = group2_size;
2823   for (stmt_info = group2; stmt_info;
2824        stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2825     {
2826       DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2827       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2828     }
2829
2830   /* For the second group, the DR_GROUP_GAP is that before the original group,
2831      plus skipping over the first vector.  */
2832   DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2833
2834   /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
2835   DR_GROUP_GAP (first_vinfo) += group2_size;
2836
2837   if (dump_enabled_p ())
2838     dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2839                      group1_size, group2_size);
2840
2841   return group2;
2842 }
2843
2844 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2845    statements and a vector of NUNITS elements.  */
2846
2847 static poly_uint64
2848 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2849 {
2850   return exact_div (common_multiple (nunits, group_size), group_size);
2851 }
2852
2853 /* Helper that checks to see if a node is a load node.  */
2854
2855 static inline bool
2856 vect_is_slp_load_node  (slp_tree root)
2857 {
2858   return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2859          && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2860          && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2861 }
2862
2863
2864 /* Helper function of optimize_load_redistribution that performs the operation
2865    recursively.  */
2866
2867 static slp_tree
2868 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2869                                 vec_info *vinfo, unsigned int group_size,
2870                                 hash_map<slp_tree, slp_tree> *load_map,
2871                                 slp_tree root)
2872 {
2873   if (slp_tree *leader = load_map->get (root))
2874     return *leader;
2875
2876   slp_tree node;
2877   unsigned i;
2878
2879   /* For now, we don't know anything about externals so do not do anything.  */
2880   if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2881     return NULL;
2882   else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2883     {
2884       /* First convert this node into a load node and add it to the leaves
2885          list and flatten the permute from a lane to a load one.  If it's
2886          unneeded it will be elided later.  */
2887       vec<stmt_vec_info> stmts;
2888       stmts.create (SLP_TREE_LANES (root));
2889       lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2890       for (unsigned j = 0; j < lane_perm.length (); j++)
2891         {
2892           std::pair<unsigned, unsigned> perm = lane_perm[j];
2893           node = SLP_TREE_CHILDREN (root)[perm.first];
2894
2895           if (!vect_is_slp_load_node (node)
2896               || SLP_TREE_CHILDREN (node).exists ())
2897             {
2898               stmts.release ();
2899               goto next;
2900             }
2901
2902           stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2903         }
2904
2905       if (dump_enabled_p ())
2906         dump_printf_loc (MSG_NOTE, vect_location,
2907                          "converting stmts on permute node %p\n",
2908                          (void *) root);
2909
2910       bool *matches = XALLOCAVEC (bool, group_size);
2911       poly_uint64 max_nunits = 1;
2912       unsigned tree_size = 0, limit = 1;
2913       node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2914                                   matches, &limit, &tree_size, bst_map);
2915       if (!node)
2916         stmts.release ();
2917
2918       load_map->put (root, node);
2919       return node;
2920     }
2921
2922 next:
2923   load_map->put (root, NULL);
2924
2925   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2926     {
2927       slp_tree value
2928         = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2929                                           node);
2930       if (value)
2931         {
2932           SLP_TREE_REF_COUNT (value)++;
2933           SLP_TREE_CHILDREN (root)[i] = value;
2934           /* ???  We know the original leafs of the replaced nodes will
2935              be referenced by bst_map, only the permutes created by
2936              pattern matching are not.  */
2937           if (SLP_TREE_REF_COUNT (node) == 1)
2938             load_map->remove (node);
2939           vect_free_slp_tree (node);
2940         }
2941     }
2942
2943   return NULL;
2944 }
2945
2946 /* Temporary workaround for loads not being CSEd during SLP build.  This
2947    function will traverse the SLP tree rooted in ROOT for INSTANCE and find
2948    VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2949    same DR such that the final operation is equal to a permuted load.  Such
2950    NODES are then directly converted into LOADS themselves.  The nodes are
2951    CSEd using BST_MAP.  */
2952
2953 static void
2954 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2955                               vec_info *vinfo, unsigned int group_size,
2956                               hash_map<slp_tree, slp_tree> *load_map,
2957                               slp_tree root)
2958 {
2959   slp_tree node;
2960   unsigned i;
2961
2962   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2963     {
2964       slp_tree value
2965         = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2966                                           node);
2967       if (value)
2968         {
2969           SLP_TREE_REF_COUNT (value)++;
2970           SLP_TREE_CHILDREN (root)[i] = value;
2971           /* ???  We know the original leafs of the replaced nodes will
2972              be referenced by bst_map, only the permutes created by
2973              pattern matching are not.  */
2974           if (SLP_TREE_REF_COUNT (node) == 1)
2975             load_map->remove (node);
2976           vect_free_slp_tree (node);
2977         }
2978     }
2979 }
2980
2981 /* Helper function of vect_match_slp_patterns.
2982
2983    Attempts to match patterns against the slp tree rooted in REF_NODE using
2984    VINFO.  Patterns are matched in post-order traversal.
2985
2986    If matching is successful the value in REF_NODE is updated and returned, if
2987    not then it is returned unchanged.  */
2988
2989 static bool
2990 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2991                            slp_tree_to_load_perm_map_t *perm_cache,
2992                            slp_compat_nodes_map_t *compat_cache,
2993                            hash_set<slp_tree> *visited)
2994 {
2995   unsigned i;
2996   slp_tree node = *ref_node;
2997   bool found_p = false;
2998   if (!node || visited->add (node))
2999     return false;
3000
3001   slp_tree child;
3002   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3003     found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3004                                           vinfo, perm_cache, compat_cache,
3005                                           visited);
3006
3007   for (unsigned x = 0; x < num__slp_patterns; x++)
3008     {
3009       vect_pattern *pattern
3010         = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3011       if (pattern)
3012         {
3013           pattern->build (vinfo);
3014           delete pattern;
3015           found_p = true;
3016         }
3017     }
3018
3019   return found_p;
3020 }
3021
3022 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3023    vec_info VINFO.
3024
3025    The modified tree is returned.  Patterns are tried in order and multiple
3026    patterns may match.  */
3027
3028 static bool
3029 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3030                          hash_set<slp_tree> *visited,
3031                          slp_tree_to_load_perm_map_t *perm_cache,
3032                          slp_compat_nodes_map_t *compat_cache)
3033 {
3034   DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3035   slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3036
3037   if (dump_enabled_p ())
3038     dump_printf_loc (MSG_NOTE, vect_location,
3039                      "Analyzing SLP tree %p for patterns\n",
3040                      (void *) SLP_INSTANCE_TREE (instance));
3041
3042   return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3043                                     visited);
3044 }
3045
3046 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3047    splitting into two, with the first split group having size NEW_GROUP_SIZE.
3048    Return true if we could use IFN_STORE_LANES instead and if that appears
3049    to be the better approach.  */
3050
3051 static bool
3052 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3053                                unsigned int group_size,
3054                                unsigned int new_group_size)
3055 {
3056   tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3057   tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3058   if (!vectype)
3059     return false;
3060   /* Allow the split if one of the two new groups would operate on full
3061      vectors *within* rather than across one scalar loop iteration.
3062      This is purely a heuristic, but it should work well for group
3063      sizes of 3 and 4, where the possible splits are:
3064
3065        3->2+1:  OK if the vector has exactly two elements
3066        4->2+2:  Likewise
3067        4->3+1:  Less clear-cut.  */
3068   if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3069       || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3070     return false;
3071   return vect_store_lanes_supported (vectype, group_size, false);
3072 }
3073
3074 /* Analyze an SLP instance starting from a group of grouped stores.  Call
3075    vect_build_slp_tree to build a tree of packed stmts if possible.
3076    Return FALSE if it's impossible to SLP any stmt in the loop.  */
3077
3078 static bool
3079 vect_analyze_slp_instance (vec_info *vinfo,
3080                            scalar_stmts_to_slp_tree_map_t *bst_map,
3081                            stmt_vec_info stmt_info, slp_instance_kind kind,
3082                            unsigned max_tree_size, unsigned *limit);
3083
3084 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3085    of KIND.  Return true if successful.  */
3086
3087 static bool
3088 vect_build_slp_instance (vec_info *vinfo,
3089                          slp_instance_kind kind,
3090                          vec<stmt_vec_info> &scalar_stmts,
3091                          vec<stmt_vec_info> &root_stmt_infos,
3092                          unsigned max_tree_size, unsigned *limit,
3093                          scalar_stmts_to_slp_tree_map_t *bst_map,
3094                          /* ???  We need stmt_info for group splitting.  */
3095                          stmt_vec_info stmt_info_)
3096 {
3097   if (dump_enabled_p ())
3098     {
3099       dump_printf_loc (MSG_NOTE, vect_location,
3100                        "Starting SLP discovery for\n");
3101       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3102         dump_printf_loc (MSG_NOTE, vect_location,
3103                          "  %G", scalar_stmts[i]->stmt);
3104     }
3105
3106   /* Build the tree for the SLP instance.  */
3107   unsigned int group_size = scalar_stmts.length ();
3108   bool *matches = XALLOCAVEC (bool, group_size);
3109   poly_uint64 max_nunits = 1;
3110   unsigned tree_size = 0;
3111   unsigned i;
3112   slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3113                                        &max_nunits, matches, limit,
3114                                        &tree_size, bst_map);
3115   if (node != NULL)
3116     {
3117       /* Calculate the unrolling factor based on the smallest type.  */
3118       poly_uint64 unrolling_factor
3119         = calculate_unrolling_factor (max_nunits, group_size);
3120
3121       if (maybe_ne (unrolling_factor, 1U)
3122           && is_a <bb_vec_info> (vinfo))
3123         {
3124           unsigned HOST_WIDE_INT const_max_nunits;
3125           if (!max_nunits.is_constant (&const_max_nunits)
3126               || const_max_nunits > group_size)
3127             {
3128               if (dump_enabled_p ())
3129                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3130                                  "Build SLP failed: store group "
3131                                  "size not a multiple of the vector size "
3132                                  "in basic block SLP\n");
3133               vect_free_slp_tree (node);
3134               return false;
3135             }
3136           /* Fatal mismatch.  */
3137           if (dump_enabled_p ())
3138             dump_printf_loc (MSG_NOTE, vect_location,
3139                              "SLP discovery succeeded but node needs "
3140                              "splitting\n");
3141           memset (matches, true, group_size);
3142           matches[group_size / const_max_nunits * const_max_nunits] = false;
3143           vect_free_slp_tree (node);
3144         }
3145       else
3146         {
3147           /* Create a new SLP instance.  */
3148           slp_instance new_instance = XNEW (class _slp_instance);
3149           SLP_INSTANCE_TREE (new_instance) = node;
3150           SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3151           SLP_INSTANCE_LOADS (new_instance) = vNULL;
3152           SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3153           SLP_INSTANCE_KIND (new_instance) = kind;
3154           new_instance->reduc_phis = NULL;
3155           new_instance->cost_vec = vNULL;
3156           new_instance->subgraph_entries = vNULL;
3157
3158           if (dump_enabled_p ())
3159             dump_printf_loc (MSG_NOTE, vect_location,
3160                              "SLP size %u vs. limit %u.\n",
3161                              tree_size, max_tree_size);
3162
3163           /* Fixup SLP reduction chains.  */
3164           if (kind == slp_inst_kind_reduc_chain)
3165             {
3166               /* If this is a reduction chain with a conversion in front
3167                  amend the SLP tree with a node for that.  */
3168               gimple *scalar_def
3169                 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3170               if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3171                 {
3172                   /* Get at the conversion stmt - we know it's the single use
3173                      of the last stmt of the reduction chain.  */
3174                   use_operand_p use_p;
3175                   bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3176                                            &use_p, &scalar_def);
3177                   gcc_assert (r);
3178                   stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3179                   next_info = vect_stmt_to_vectorize (next_info);
3180                   scalar_stmts = vNULL;
3181                   scalar_stmts.create (group_size);
3182                   for (unsigned i = 0; i < group_size; ++i)
3183                     scalar_stmts.quick_push (next_info);
3184                   slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3185                   SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3186                   SLP_TREE_CHILDREN (conv).quick_push (node);
3187                   SLP_INSTANCE_TREE (new_instance) = conv;
3188                   /* We also have to fake this conversion stmt as SLP reduction
3189                      group so we don't have to mess with too much code
3190                      elsewhere.  */
3191                   REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3192                   REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3193                 }
3194               /* Fill the backedge child of the PHI SLP node.  The
3195                  general matching code cannot find it because the
3196                  scalar code does not reflect how we vectorize the
3197                  reduction.  */
3198               use_operand_p use_p;
3199               imm_use_iterator imm_iter;
3200               class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3201               FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3202                                      gimple_get_lhs (scalar_def))
3203                 /* There are exactly two non-debug uses, the reduction
3204                    PHI and the loop-closed PHI node.  */
3205                 if (!is_gimple_debug (USE_STMT (use_p))
3206                     && gimple_bb (USE_STMT (use_p)) == loop->header)
3207                   {
3208                     auto_vec<stmt_vec_info, 64> phis (group_size);
3209                     stmt_vec_info phi_info
3210                       = vinfo->lookup_stmt (USE_STMT (use_p));
3211                     for (unsigned i = 0; i < group_size; ++i)
3212                       phis.quick_push (phi_info);
3213                     slp_tree *phi_node = bst_map->get (phis);
3214                     unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3215                     SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3216                       = SLP_INSTANCE_TREE (new_instance);
3217                     SLP_INSTANCE_TREE (new_instance)->refcnt++;
3218                   }
3219             }
3220
3221           vinfo->slp_instances.safe_push (new_instance);
3222
3223           /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3224              the number of scalar stmts in the root in a few places.
3225              Verify that assumption holds.  */
3226           gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3227                         .length () == group_size);
3228
3229           if (dump_enabled_p ())
3230             {
3231               dump_printf_loc (MSG_NOTE, vect_location,
3232                                "Final SLP tree for instance %p:\n",
3233                                (void *) new_instance);
3234               vect_print_slp_graph (MSG_NOTE, vect_location,
3235                                     SLP_INSTANCE_TREE (new_instance));
3236             }
3237
3238           return true;
3239         }
3240     }
3241   else
3242     {
3243       /* Failed to SLP.  */
3244       /* Free the allocated memory.  */
3245       scalar_stmts.release ();
3246     }
3247
3248   stmt_vec_info stmt_info = stmt_info_;
3249   /* Try to break the group up into pieces.  */
3250   if (kind == slp_inst_kind_store)
3251     {
3252       /* ???  We could delay all the actual splitting of store-groups
3253          until after SLP discovery of the original group completed.
3254          Then we can recurse to vect_build_slp_instance directly.  */
3255       for (i = 0; i < group_size; i++)
3256         if (!matches[i])
3257           break;
3258
3259       /* For basic block SLP, try to break the group up into multiples of
3260          a vector size.  */
3261       if (is_a <bb_vec_info> (vinfo)
3262           && (i > 1 && i < group_size))
3263         {
3264           tree scalar_type
3265             = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3266           tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3267                                                       1 << floor_log2 (i));
3268           unsigned HOST_WIDE_INT const_nunits;
3269           if (vectype
3270               && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3271             {
3272               /* Split into two groups at the first vector boundary.  */
3273               gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3274               unsigned group1_size = i & ~(const_nunits - 1);
3275
3276               if (dump_enabled_p ())
3277                 dump_printf_loc (MSG_NOTE, vect_location,
3278                                  "Splitting SLP group at stmt %u\n", i);
3279               stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3280                                                                group1_size);
3281               bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3282                                                     kind, max_tree_size,
3283                                                     limit);
3284               /* Split the rest at the failure point and possibly
3285                  re-analyze the remaining matching part if it has
3286                  at least two lanes.  */
3287               if (group1_size < i
3288                   && (i + 1 < group_size
3289                       || i - group1_size > 1))
3290                 {
3291                   stmt_vec_info rest2 = rest;
3292                   rest = vect_split_slp_store_group (rest, i - group1_size);
3293                   if (i - group1_size > 1)
3294                     res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3295                                                       kind, max_tree_size,
3296                                                       limit);
3297                 }
3298               /* Re-analyze the non-matching tail if it has at least
3299                  two lanes.  */
3300               if (i + 1 < group_size)
3301                 res |= vect_analyze_slp_instance (vinfo, bst_map,
3302                                                   rest, kind, max_tree_size,
3303                                                   limit);
3304               return res;
3305             }
3306         }
3307
3308       /* For loop vectorization split into arbitrary pieces of size > 1.  */
3309       if (is_a <loop_vec_info> (vinfo)
3310           && (i > 1 && i < group_size)
3311           && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3312         {
3313           unsigned group1_size = i;
3314
3315           if (dump_enabled_p ())
3316             dump_printf_loc (MSG_NOTE, vect_location,
3317                              "Splitting SLP group at stmt %u\n", i);
3318
3319           stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3320                                                            group1_size);
3321           /* Loop vectorization cannot handle gaps in stores, make sure
3322              the split group appears as strided.  */
3323           STMT_VINFO_STRIDED_P (rest) = 1;
3324           DR_GROUP_GAP (rest) = 0;
3325           STMT_VINFO_STRIDED_P (stmt_info) = 1;
3326           DR_GROUP_GAP (stmt_info) = 0;
3327
3328           bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3329                                                 kind, max_tree_size, limit);
3330           if (i + 1 < group_size)
3331             res |= vect_analyze_slp_instance (vinfo, bst_map,
3332                                               rest, kind, max_tree_size, limit);
3333
3334           return res;
3335         }
3336
3337       /* Even though the first vector did not all match, we might be able to SLP
3338          (some) of the remainder.  FORNOW ignore this possibility.  */
3339     }
3340
3341   /* Failed to SLP.  */
3342   if (dump_enabled_p ())
3343     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3344   return false;
3345 }
3346
3347
3348 /* Analyze an SLP instance starting from a group of grouped stores.  Call
3349    vect_build_slp_tree to build a tree of packed stmts if possible.
3350    Return FALSE if it's impossible to SLP any stmt in the loop.  */
3351
3352 static bool
3353 vect_analyze_slp_instance (vec_info *vinfo,
3354                            scalar_stmts_to_slp_tree_map_t *bst_map,
3355                            stmt_vec_info stmt_info,
3356                            slp_instance_kind kind,
3357                            unsigned max_tree_size, unsigned *limit)
3358 {
3359   unsigned int i;
3360   vec<stmt_vec_info> scalar_stmts;
3361
3362   if (is_a <bb_vec_info> (vinfo))
3363     vect_location = stmt_info->stmt;
3364
3365   stmt_vec_info next_info = stmt_info;
3366   if (kind == slp_inst_kind_store)
3367     {
3368       /* Collect the stores and store them in scalar_stmts.  */
3369       scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3370       while (next_info)
3371         {
3372           scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3373           next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3374         }
3375     }
3376   else if (kind == slp_inst_kind_reduc_chain)
3377     {
3378       /* Collect the reduction stmts and store them in scalar_stmts.  */
3379       scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3380       while (next_info)
3381         {
3382           scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3383           next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3384         }
3385       /* Mark the first element of the reduction chain as reduction to properly
3386          transform the node.  In the reduction analysis phase only the last
3387          element of the chain is marked as reduction.  */
3388       STMT_VINFO_DEF_TYPE (stmt_info)
3389         = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3390       STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3391         = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3392     }
3393   else if (kind == slp_inst_kind_ctor)
3394     {
3395       tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
3396       tree val;
3397       scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
3398       FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
3399         {
3400           stmt_vec_info def_info = vinfo->lookup_def (val);
3401           def_info = vect_stmt_to_vectorize (def_info);
3402           scalar_stmts.quick_push (def_info);
3403         }
3404       if (dump_enabled_p ())
3405         dump_printf_loc (MSG_NOTE, vect_location,
3406                          "Analyzing vectorizable constructor: %G\n",
3407                          stmt_info->stmt);
3408     }
3409   else if (kind == slp_inst_kind_reduc_group)
3410     {
3411       /* Collect reduction statements.  */
3412       const vec<stmt_vec_info> &reductions
3413         = as_a <loop_vec_info> (vinfo)->reductions;
3414       scalar_stmts.create (reductions.length ());
3415       for (i = 0; reductions.iterate (i, &next_info); i++)
3416         if ((STMT_VINFO_RELEVANT_P (next_info)
3417              || STMT_VINFO_LIVE_P (next_info))
3418             /* ???  Make sure we didn't skip a conversion around a reduction
3419                path.  In that case we'd have to reverse engineer that conversion
3420                stmt following the chain using reduc_idx and from the PHI
3421                using reduc_def.  */
3422             && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3423           scalar_stmts.quick_push (next_info);
3424       /* If less than two were relevant/live there's nothing to SLP.  */
3425       if (scalar_stmts.length () < 2)
3426         return false;
3427     }
3428   else
3429     gcc_unreachable ();
3430
3431   vec<stmt_vec_info> roots = vNULL;
3432   if (kind == slp_inst_kind_ctor)
3433     {
3434       roots.create (1);
3435       roots.quick_push (stmt_info);
3436     }
3437   /* Build the tree for the SLP instance.  */
3438   bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3439                                       roots,
3440                                       max_tree_size, limit, bst_map,
3441                                       kind == slp_inst_kind_store
3442                                       ? stmt_info : NULL);
3443   if (!res)
3444     roots.release ();
3445
3446   /* ???  If this is slp_inst_kind_store and the above succeeded here's
3447      where we should do store group splitting.  */
3448
3449   return res;
3450 }
3451
/* Check whether there are stmts in the loop that can be vectorized using
   SLP.  Build SLP trees of packed scalar stmts if SLP is possible.  */
3454
3455 opt_result
3456 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3457 {
3458   unsigned int i;
3459   stmt_vec_info first_element;
3460   slp_instance instance;
3461
3462   DUMP_VECT_SCOPE ("vect_analyze_slp");
3463
3464   unsigned limit = max_tree_size;
3465
3466   scalar_stmts_to_slp_tree_map_t *bst_map
3467     = new scalar_stmts_to_slp_tree_map_t ();
3468
3469   /* Find SLP sequences starting from groups of grouped stores.  */
3470   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3471     vect_analyze_slp_instance (vinfo, bst_map, first_element,
3472                                STMT_VINFO_GROUPED_ACCESS (first_element)
3473                                ? slp_inst_kind_store : slp_inst_kind_ctor,
3474                                max_tree_size, &limit);
3475
3476   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3477     {
3478       for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3479         {
3480           vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3481           if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3482                                        bb_vinfo->roots[i].stmts,
3483                                        bb_vinfo->roots[i].roots,
3484                                        max_tree_size, &limit, bst_map, NULL))
3485             {
3486               bb_vinfo->roots[i].stmts = vNULL;
3487               bb_vinfo->roots[i].roots = vNULL;
3488             }
3489         }
3490     }
3491
3492   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3493     {
3494       /* Find SLP sequences starting from reduction chains.  */
3495       FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3496         if (! STMT_VINFO_RELEVANT_P (first_element)
3497             && ! STMT_VINFO_LIVE_P (first_element))
3498           ;
3499         else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3500                                               slp_inst_kind_reduc_chain,
3501                                               max_tree_size, &limit))
3502           {
3503             /* Dissolve reduction chain group.  */
3504             stmt_vec_info vinfo = first_element;
3505             stmt_vec_info last = NULL;
3506             while (vinfo)
3507               {
3508                 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3509                 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3510                 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3511                 last = vinfo;
3512                 vinfo = next;
3513               }
3514             STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3515             /* It can be still vectorized as part of an SLP reduction.  */
3516             loop_vinfo->reductions.safe_push (last);
3517           }
3518
3519       /* Find SLP sequences starting from groups of reductions.  */
3520       if (loop_vinfo->reductions.length () > 1)
3521         vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3522                                    slp_inst_kind_reduc_group, max_tree_size,
3523                                    &limit);
3524     }
3525
3526   hash_set<slp_tree> visited_patterns;
3527   slp_tree_to_load_perm_map_t perm_cache;
3528   slp_compat_nodes_map_t compat_cache;
3529
3530   /* See if any patterns can be found in the SLP tree.  */
3531   bool pattern_found = false;
3532   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3533     pattern_found |= vect_match_slp_patterns (instance, vinfo,
3534                                               &visited_patterns, &perm_cache,
3535                                               &compat_cache);
3536
3537   /* If any were found optimize permutations of loads.  */
3538   if (pattern_found)
3539     {
3540       hash_map<slp_tree, slp_tree> load_map;
3541       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3542         {
3543           slp_tree root = SLP_INSTANCE_TREE (instance);
3544           optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3545                                         &load_map, root);
3546         }
3547     }
3548
3549
3550
3551   /* The map keeps a reference on SLP nodes built, release that.  */
3552   for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3553        it != bst_map->end (); ++it)
3554     if ((*it).second)
3555       vect_free_slp_tree ((*it).second);
3556   delete bst_map;
3557
3558   if (pattern_found && dump_enabled_p ())
3559     {
3560       dump_printf_loc (MSG_NOTE, vect_location,
3561                        "Pattern matched SLP tree\n");
3562       hash_set<slp_tree> visited;
3563       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3564         vect_print_slp_graph (MSG_NOTE, vect_location,
3565                               SLP_INSTANCE_TREE (instance), visited);
3566     }
3567
3568   return opt_result::success ();
3569 }
3570
3571 /* Estimates the cost of inserting layout changes into the SLP graph.
3572    It can also say that the insertion is impossible.  */
3573
3574 struct slpg_layout_cost
3575 {
3576   slpg_layout_cost () = default;
3577   slpg_layout_cost (sreal, bool);
3578
3579   static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3580   bool is_possible () const { return depth != sreal::max (); }
3581
3582   bool operator== (const slpg_layout_cost &) const;
3583   bool operator!= (const slpg_layout_cost &) const;
3584
3585   bool is_better_than (const slpg_layout_cost &, bool) const;
3586
3587   void add_parallel_cost (const slpg_layout_cost &);
3588   void add_serial_cost (const slpg_layout_cost &);
3589   void split (unsigned int);
3590
3591   /* The longest sequence of layout changes needed during any traversal
3592      of the partition dag, weighted by execution frequency.
3593
3594      This is the most important metric when optimizing for speed, since
3595      it helps to ensure that we keep the number of operations on
3596      critical paths to a minimum.  */
3597   sreal depth = 0;
3598
3599   /* An estimate of the total number of operations needed.  It is weighted by
3600      execution frequency when optimizing for speed but not when optimizing for
3601      size.  In order to avoid double-counting, a node with a fanout of N will
3602      distribute 1/N of its total cost to each successor.
3603
3604      This is the most important metric when optimizing for size, since
3605      it helps to keep the total number of operations to a minimum,  */
3606   sreal total = 0;
3607 };
3608
3609 /* Construct costs for a node with weight WEIGHT.  A higher weight
3610    indicates more frequent execution.  IS_FOR_SIZE is true if we are
3611    optimizing for size rather than speed.  */
3612
3613 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3614   : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3615 {
3616 }
3617
3618 bool
3619 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3620 {
3621   return depth == other.depth && total == other.total;
3622 }
3623
3624 bool
3625 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3626 {
3627   return !operator== (other);
3628 }
3629
3630 /* Return true if these costs are better than OTHER.  IS_FOR_SIZE is
3631    true if we are optimizing for size rather than speed.  */
3632
3633 bool
3634 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3635                                   bool is_for_size) const
3636 {
3637   if (is_for_size)
3638     {
3639       if (total != other.total)
3640         return total < other.total;
3641       return depth < other.depth;
3642     }
3643   else
3644     {
3645       if (depth != other.depth)
3646         return depth < other.depth;
3647       return total < other.total;
3648     }
3649 }
3650
3651 /* Increase the costs to account for something with cost INPUT_COST
3652    happening in parallel with the current costs.  */
3653
3654 void
3655 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3656 {
3657   depth = std::max (depth, input_cost.depth);
3658   total += input_cost.total;
3659 }
3660
3661 /* Increase the costs to account for something with cost INPUT_COST
3662    happening in series with the current costs.  */
3663
3664 void
3665 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3666 {
3667   depth += other.depth;
3668   total += other.total;
3669 }
3670
3671 /* Split the total cost among TIMES successors or predecessors.  */
3672
3673 void
3674 slpg_layout_cost::split (unsigned int times)
3675 {
3676   if (times > 1)
3677     total /= times;
3678 }
3679
3680 /* Information about one node in the SLP graph, for use during
3681    vect_optimize_slp_pass.  */
3682
3683 struct slpg_vertex
3684 {
3685   slpg_vertex (slp_tree node_) : node (node_) {}
3686
3687   /* The node itself.  */
3688   slp_tree node;
3689
3690   /* Which partition the node belongs to, or -1 if none.  Nodes outside of
3691      partitions are flexible; they can have whichever layout consumers
3692      want them to have.  */
3693   int partition = -1;
3694
3695   /* The number of nodes that directly use the result of this one
3696      (i.e. the number of nodes that count this one as a child).  */
3697   unsigned int out_degree = 0;
3698
3699   /* The execution frequency of the node.  */
3700   sreal weight = 0;
3701
3702   /* The total execution frequency of all nodes that directly use the
3703      result of this one.  */
3704   sreal out_weight = 0;
3705 };
3706
3707 /* Information about one partition of the SLP graph, for use during
3708    vect_optimize_slp_pass.  */
3709
struct slpg_partition_info
{
  /* The half-open range [NODE_BEGIN, NODE_END) of indices into
     m_partitioned_nodes that holds the nodes of this partition.  */
  unsigned int node_begin = 0;
  unsigned int node_end = 0;

  /* The layout chosen for this partition, or -1 while no choice has
     been made.  */
  int layout = -1;

  /* The number of predecessors and successors in the partition dag.
     Predecessors always have lower partition numbers and successors
     always have higher partition numbers.

     The directions of these edges do not necessarily agree with the
     data flow graph.  For example, if an SCC has separate partitions
     for an inner loop and an outer loop, the inner loop's partition
     will have at least two incoming edges from the outer loop's
     partition: one for a live-in value and one for a live-out value.
     In data flow terms, one of these edges would also run from the
     outer loop to the inner loop, but the other would run in the
     opposite direction.  */
  unsigned int in_degree = 0;
  unsigned int out_degree = 0;
};
3735
3736 /* Information about the costs of using a particular layout for a
3737    particular partition.  It can also say that the combination is
3738    impossible.  */
3739
3740 struct slpg_partition_layout_costs
3741 {
3742   bool is_possible () const { return internal_cost.is_possible (); }
3743   void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3744
3745   /* The costs inherited from predecessor partitions.  */
3746   slpg_layout_cost in_cost;
3747
3748   /* The inherent cost of the layout within the node itself.  For example,
3749      this is nonzero for a load if choosing a particular layout would require
3750      the load to permute the loaded elements.  It is nonzero for a
3751      VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3752      to full-vector moves.  */
3753   slpg_layout_cost internal_cost;
3754
3755   /* The costs inherited from successor partitions.  */
3756   slpg_layout_cost out_cost;
3757 };
3758
3759 /* This class tries to optimize the layout of vectors in order to avoid
3760    unnecessary shuffling.  At the moment, the set of possible layouts are
3761    restricted to bijective permutations.
3762
3763    The goal of the pass depends on whether we're optimizing for size or
3764    for speed.  When optimizing for size, the goal is to reduce the overall
3765    number of layout changes (including layout changes implied by things
3766    like load permutations).  When optimizing for speed, the goal is to
3767    reduce the maximum latency attributable to layout changes on any
3768    non-cyclical path through the data flow graph.
3769
3770    For example, when optimizing a loop nest for speed, we will prefer
3771    to make layout changes outside of a loop rather than inside of a loop,
3772    and will prefer to make layout changes in parallel rather than serially,
3773    even if that increases the overall number of layout changes.
3774
3775    The high-level procedure is:
3776
3777    (1) Build a graph in which edges go from uses (parents) to definitions
3778        (children).
3779
3780    (2) Divide the graph into a dag of strongly-connected components (SCCs).
3781
3782    (3) When optimizing for speed, partition the nodes in each SCC based
3783        on their containing cfg loop.  When optimizing for size, treat
3784        each SCC as a single partition.
3785
3786        This gives us a dag of partitions.  The goal is now to assign a
3787        layout to each partition.
3788
3789    (4) Construct a set of vector layouts that are worth considering.
3790        Record which nodes must keep their current layout.
3791
3792    (5) Perform a forward walk over the partition dag (from loads to stores)
3793        accumulating the "forward" cost of using each layout.  When visiting
3794        each partition, assign a tentative choice of layout to the partition
3795        and use that choice when calculating the cost of using a different
3796        layout in successor partitions.
3797
3798    (6) Perform a backward walk over the partition dag (from stores to loads),
3799        accumulating the "backward" cost of using each layout.  When visiting
3800        each partition, make a final choice of layout for that partition based
3801        on the accumulated forward costs (from (5)) and backward costs
3802        (from (6)).
3803
3804    (7) Apply the chosen layouts to the SLP graph.
3805
3806    For example, consider the SLP statements:
3807
3808    S1:      a_1 = load
3809        loop:
3810    S2:      a_2 = PHI<a_1, a_3>
3811    S3:      b_1 = load
3812    S4:      a_3 = a_2 + b_1
3813        exit:
3814    S5:      a_4 = PHI<a_3>
3815    S6:      store a_4
3816
3817    S2 and S4 form an SCC and are part of the same loop.  Every other
3818    statement is in a singleton SCC.  In this example there is a one-to-one
   mapping between SCCs and partitions and the partition dag looks like this:
3820
3821         S1     S3
3822          \     /
3823           S2+S4
3824             |
3825            S5
3826             |
3827            S6
3828
3829    S2, S3 and S4 will have a higher execution frequency than the other
3830    statements, so when optimizing for speed, the goal is to avoid any
3831    layout changes:
3832
3833    - within S3
3834    - within S2+S4
3835    - on the S3->S2+S4 edge
3836
3837    For example, if S3 was originally a reversing load, the goal of the
3838    pass is to make it an unreversed load and change the layout on the
3839    S1->S2+S4 and S2+S4->S5 edges to compensate.  (Changing the layout
3840    on S1->S2+S4 and S5->S6 would also be acceptable.)
3841
3842    The difference between SCCs and partitions becomes important if we
3843    add an outer loop:
3844
3845    S1:      a_1 = ...
3846        loop1:
3847    S2:      a_2 = PHI<a_1, a_6>
3848    S3:      b_1 = load
3849    S4:      a_3 = a_2 + b_1
3850        loop2:
3851    S5:      a_4 = PHI<a_3, a_5>
3852    S6:      c_1 = load
3853    S7:      a_5 = a_4 + c_1
3854        exit2:
3855    S8:      a_6 = PHI<a_5>
3856    S9:      store a_6
3857        exit1:
3858
3859    Here, S2, S4, S5, S7 and S8 form a single SCC.  However, when optimizing
3860    for speed, we usually do not want restrictions in the outer loop to "infect"
3861    the decision for the inner loop.  For example, if an outer-loop node
3862    in the SCC contains a statement with a fixed layout, that should not
3863    prevent the inner loop from using a different layout.  Conversely,
3864    the inner loop should not dictate a layout to the outer loop: if the
3865    outer loop does a lot of computation, then it may not be efficient to
3866    do all of that computation in the inner loop's preferred layout.
3867
3868    So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
3869    and S5+S7 (inner).  We also try to arrange partitions so that:
3870
3871    - the partition for an outer loop comes before the partition for
3872      an inner loop
3873
3874    - if a sibling loop A dominates a sibling loop B, A's partition
3875      comes before B's
3876
3877    This gives the following partition dag for the example above:
3878
3879         S1        S3
3880          \        /
3881           S2+S4+S8   S6
3882            |   \\    /
3883            |    S5+S7
3884            |
3885           S9
3886
3887    There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
3888    one for a reversal of the edge S7->S8.
3889
3890    The backward walk picks a layout for S5+S7 before S2+S4+S8.  The choice
3891    for S2+S4+S8 therefore has to balance the cost of using the outer loop's
3892    preferred layout against the cost of changing the layout on entry to the
3893    inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
3894
3895    Although this works well when optimizing for speed, it has the downside
3896    when optimizing for size that the choice of layout for S5+S7 is completely
3897    independent of S9, which lessens the chance of reducing the overall number
3898    of permutations.  We therefore do not partition SCCs when optimizing
3899    for size.
3900
3901    To give a concrete example of the difference between optimizing
3902    for size and speed, consider:
3903
3904    a[0] = (b[1] << c[3]) - d[1];
3905    a[1] = (b[0] << c[2]) - d[0];
3906    a[2] = (b[3] << c[1]) - d[3];
3907    a[3] = (b[2] << c[0]) - d[2];
3908
3909    There are three different layouts here: one for a, one for b and d,
3910    and one for c.  When optimizing for speed it is better to permute each
3911    of b, c and d into the order required by a, since those permutations
3912    happen in parallel.  But when optimizing for size, it is better to:
3913
3914    - permute c into the same order as b
3915    - do the arithmetic
3916    - permute the result into the order required by a
3917
3918    This gives 2 permutations rather than 3.  */
3919
3920 class vect_optimize_slp_pass
3921 {
3922 public:
3923   vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
3924   void run ();
3925
3926 private:
3927   /* Graph building.  */
3928   struct loop *containing_loop (slp_tree);
3929   bool is_cfg_latch_edge (graph_edge *);
3930   void build_vertices (hash_set<slp_tree> &, slp_tree);
3931   void build_vertices ();
3932   void build_graph ();
3933
3934   /* Partitioning.  */
3935   void create_partitions ();
3936   template<typename T> void for_each_partition_edge (unsigned int, T);
3937
3938   /* Layout selection.  */
3939   bool is_compatible_layout (slp_tree, unsigned int);
3940   int change_layout_cost (slp_tree, unsigned int, unsigned int);
3941   slpg_partition_layout_costs &partition_layout_costs (unsigned int,
3942                                                        unsigned int);
3943   void change_vec_perm_layout (slp_tree, lane_permutation_t &,
3944                                int, unsigned int);
3945   int internal_node_cost (slp_tree, int, unsigned int);
3946   void start_choosing_layouts ();
3947
3948   /* Cost propagation.  */
3949   slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
3950                                      unsigned int, unsigned int);
3951   slpg_layout_cost total_in_cost (unsigned int);
3952   slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
3953   slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
3954   void forward_pass ();
3955   void backward_pass ();
3956
3957   /* Rematerialization.  */
3958   slp_tree get_result_with_layout (slp_tree, unsigned int);
3959   void materialize ();
3960
3961   /* Clean-up.  */
3962   void remove_redundant_permutations ();
3963
3964   void dump ();
3965
3966   vec_info *m_vinfo;
3967
3968   /* True if we should optimize the graph for size, false if we should
3969      optimize it for speed.  (It wouldn't be easy to make this decision
3970      more locally.)  */
3971   bool m_optimize_size;
3972
3973   /* A graph of all SLP nodes, with edges leading from uses to definitions.
3974      In other words, a node's predecessors are its slp_tree parents and
3975      a node's successors are its slp_tree children.  */
3976   graph *m_slpg = nullptr;
3977
3978   /* The vertices of M_SLPG, indexed by slp_tree::vertex.  */
3979   auto_vec<slpg_vertex> m_vertices;
3980
3981   /* The list of all leaves of M_SLPG. such as external definitions, constants,
3982      and loads.  */
3983   auto_vec<int> m_leafs;
3984
3985   /* This array has one entry for every vector layout that we're considering.
3986      Element 0 is null and indicates "no change".  Other entries describe
3987      permutations that are inherent in the current graph and that we would
3988      like to reverse if possible.
3989
3990      For example, a permutation { 1, 2, 3, 0 } means that something has
3991      effectively been permuted in that way, such as a load group
3992      { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
3993      We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
3994      in order to put things "back" in order.  */
3995   auto_vec<vec<unsigned> > m_perms;
3996
3997   /* A partitioning of the nodes for which a layout must be chosen.
3998      Each partition represents an <SCC, cfg loop> pair; that is,
3999      nodes in different SCCs belong to different partitions, and nodes
4000      within an SCC can be further partitioned according to a containing
4001      cfg loop.  Partition <SCC1, L1> comes before <SCC2, L2> if:
4002
4003      - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4004        from leaves (such as loads) to roots (such as stores).
4005
4006      - SCC1 == SCC2 and L1's header strictly dominates L2's header.  */
4007   auto_vec<slpg_partition_info> m_partitions;
4008
4009   /* The list of all nodes for which a layout must be chosen.  Nodes for
4010      partition P come before the nodes for partition P+1.  Nodes within a
4011      partition are in reverse postorder.  */
4012   auto_vec<unsigned int> m_partitioned_nodes;
4013
4014   /* Index P * num-layouts + L contains the cost of using layout L
4015      for partition P.  */
4016   auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4017
4018   /* Index N * num-layouts + L, if nonnull, is a node that provides the
4019      original output of node N adjusted to have layout L.  */
4020   auto_vec<slp_tree> m_node_layouts;
4021 };
4022
4023 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4024    Also record whether we should optimize anything for speed rather
4025    than size.  */
4026
4027 void
4028 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4029                                         slp_tree node)
4030 {
4031   unsigned i;
4032   slp_tree child;
4033
4034   if (visited.add (node))
4035     return;
4036
4037   if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4038     {
4039       basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4040       if (optimize_bb_for_speed_p (bb))
4041         m_optimize_size = false;
4042     }
4043
4044   node->vertex = m_vertices.length ();
4045   m_vertices.safe_push (slpg_vertex (node));
4046
4047   bool leaf = true;
4048   bool force_leaf = false;
4049   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4050     if (child)
4051       {
4052         leaf = false;
4053         build_vertices (visited, child);
4054       }
4055     else
4056       force_leaf = true;
4057   /* Since SLP discovery works along use-def edges all cycles have an
4058      entry - but there's the exception of cycles where we do not handle
4059      the entry explicitely (but with a NULL SLP node), like some reductions
4060      and inductions.  Force those SLP PHIs to act as leafs to make them
4061      backwards reachable.  */
4062   if (leaf || force_leaf)
4063     m_leafs.safe_push (node->vertex);
4064 }
4065
4066 /* Fill the vertices and leafs vector with all nodes in the SLP graph.  */
4067
4068 void
4069 vect_optimize_slp_pass::build_vertices ()
4070 {
4071   hash_set<slp_tree> visited;
4072   unsigned i;
4073   slp_instance instance;
4074   FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4075     build_vertices (visited, SLP_INSTANCE_TREE (instance));
4076 }
4077
4078 /* Apply (reverse) bijectite PERM to VEC.  */
4079
4080 template <class T>
4081 static void
4082 vect_slp_permute (vec<unsigned> perm,
4083                   vec<T> &vec, bool reverse)
4084 {
4085   auto_vec<T, 64> saved;
4086   saved.create (vec.length ());
4087   for (unsigned i = 0; i < vec.length (); ++i)
4088     saved.quick_push (vec[i]);
4089
4090   if (reverse)
4091     {
4092       for (unsigned i = 0; i < vec.length (); ++i)
4093         vec[perm[i]] = saved[i];
4094       for (unsigned i = 0; i < vec.length (); ++i)
4095         gcc_assert (vec[perm[i]] == saved[i]);
4096     }
4097   else
4098     {
4099       for (unsigned i = 0; i < vec.length (); ++i)
4100         vec[i] = saved[perm[i]];
4101       for (unsigned i = 0; i < vec.length (); ++i)
4102         gcc_assert (vec[i] == saved[perm[i]]);
4103     }
4104 }
4105
4106 /* Return the cfg loop that contains NODE.  */
4107
4108 struct loop *
4109 vect_optimize_slp_pass::containing_loop (slp_tree node)
4110 {
4111   stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4112   if (!rep)
4113     return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4114   return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4115 }
4116
4117 /* Return true if UD (an edge from a use to a definition) is associated
4118    with a loop latch edge in the cfg.  */
4119
4120 bool
4121 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4122 {
4123   slp_tree use = m_vertices[ud->src].node;
4124   slp_tree def = m_vertices[ud->dest].node;
4125   if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4126       || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4127     return false;
4128
4129   stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4130   return (is_a<gphi *> (use_rep->stmt)
4131           && bb_loop_header_p (gimple_bb (use_rep->stmt))
4132           && containing_loop (def) == containing_loop (use));
4133 }
4134
4135 /* Build the graph.  Mark edges that correspond to cfg loop latch edges with
4136    a nonnull data field.  */
4137
4138 void
4139 vect_optimize_slp_pass::build_graph ()
4140 {
4141   m_optimize_size = true;
4142   build_vertices ();
4143
4144   m_slpg = new_graph (m_vertices.length ());
4145   for (slpg_vertex &v : m_vertices)
4146     for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4147       if (child)
4148         {
4149           graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4150           if (is_cfg_latch_edge (ud))
4151             ud->data = this;
4152         }
4153 }
4154
4155 /* Return true if E corresponds to a loop latch edge in the cfg.  */
4156
4157 static bool
4158 skip_cfg_latch_edges (graph_edge *e)
4159 {
4160   return e->data;
4161 }
4162
4163 /* Create the node partitions.  */
4164
4165 void
4166 vect_optimize_slp_pass::create_partitions ()
4167 {
4168   /* Calculate a postorder of the graph, ignoring edges that correspond
4169      to natural latch edges in the cfg.  Reading the vector from the end
4170      to the beginning gives the reverse postorder.  */
4171   auto_vec<int> initial_rpo;
4172   graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4173                false, NULL, skip_cfg_latch_edges);
4174   gcc_assert (initial_rpo.length () == m_vertices.length ());
4175
4176   /* Calculate the strongly connected components of the graph.  */
4177   auto_vec<int> scc_grouping;
4178   unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4179
4180   /* Create a new index order in which all nodes from the same SCC are
4181      consecutive.  Use scc_pos to record the index of the first node in
4182      each SCC.  */
4183   auto_vec<unsigned int> scc_pos (num_sccs);
4184   int last_component = -1;
4185   unsigned int node_count = 0;
4186   for (unsigned int node_i : scc_grouping)
4187     {
4188       if (last_component != m_slpg->vertices[node_i].component)
4189         {
4190           last_component = m_slpg->vertices[node_i].component;
4191           gcc_assert (last_component == int (scc_pos.length ()));
4192           scc_pos.quick_push (node_count);
4193         }
4194       node_count += 1;
4195     }
4196   gcc_assert (node_count == initial_rpo.length ()
4197               && last_component + 1 == int (num_sccs));
4198
4199   /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4200      inside each SCC following the RPO we calculated above.  The fact that
4201      we ignored natural latch edges when calculating the RPO should ensure
4202      that, for natural loop nests:
4203
4204      - the first node that we encounter in a cfg loop is the loop header phi
4205      - the loop header phis are in dominance order
4206
4207      Arranging for this is an optimization (see below) rather than a
4208      correctness issue.  Unnatural loops with a tangled mess of backedges
4209      will still work correctly, but might give poorer results.
4210
4211      Also update scc_pos so that it gives 1 + the index of the last node
4212      in the SCC.  */
4213   m_partitioned_nodes.safe_grow (node_count);
4214   for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4215     {
4216       unsigned int node_i = initial_rpo[old_i];
4217       unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4218       m_partitioned_nodes[new_i] = node_i;
4219     }
4220
4221   /* When optimizing for speed, partition each SCC based on the containing
4222      cfg loop. The order we constructed above should ensure that, for natural
4223      cfg loops, we'll create sub-SCC partitions for outer loops before
4224      the corresponding sub-SCC partitions for inner loops.  Similarly,
4225      when one sibling loop A dominates another sibling loop B, we should
4226      create a sub-SCC partition for A before a sub-SCC partition for B.
4227
4228      As above, nothing depends for correctness on whether this achieves
4229      a natural nesting, but we should get better results when it does.  */
4230   m_partitions.reserve (m_vertices.length ());
4231   unsigned int next_partition_i = 0;
4232   hash_map<struct loop *, int> loop_partitions;
4233   unsigned int rpo_begin = 0;
4234   unsigned int num_partitioned_nodes = 0;
4235   for (unsigned int rpo_end : scc_pos)
4236     {
4237       loop_partitions.empty ();
4238       unsigned int partition_i = next_partition_i;
4239       for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4240         {
4241           /* Handle externals and constants optimistically throughout.
4242              But treat existing vectors as fixed since we do not handle
4243              permuting them.  */
4244           unsigned int node_i = m_partitioned_nodes[rpo_i];
4245           auto &vertex = m_vertices[node_i];
4246           if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4247                && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4248               || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4249             vertex.partition = -1;
4250           else
4251             {
4252               bool existed;
4253               if (m_optimize_size)
4254                 existed = next_partition_i > partition_i;
4255               else
4256                 {
4257                   struct loop *loop = containing_loop (vertex.node);
4258                   auto &entry = loop_partitions.get_or_insert (loop, &existed);
4259                   if (!existed)
4260                     entry = next_partition_i;
4261                   partition_i = entry;
4262                 }
4263               if (!existed)
4264                 {
4265                   m_partitions.quick_push (slpg_partition_info ());
4266                   next_partition_i += 1;
4267                 }
4268               vertex.partition = partition_i;
4269               num_partitioned_nodes += 1;
4270               m_partitions[partition_i].node_end += 1;
4271             }
4272         }
4273       rpo_begin = rpo_end;
4274     }
4275
4276   /* Assign ranges of consecutive node indices to each partition,
4277      in partition order.  Start with node_end being the same as
4278      node_begin so that the next loop can use it as a counter.  */
4279   unsigned int node_begin = 0;
4280   for (auto &partition : m_partitions)
4281     {
4282       partition.node_begin = node_begin;
4283       node_begin += partition.node_end;
4284       partition.node_end = partition.node_begin;
4285     }
4286   gcc_assert (node_begin == num_partitioned_nodes);
4287
4288   /* Finally build the list of nodes in partition order.  */
4289   m_partitioned_nodes.truncate (num_partitioned_nodes);
4290   for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4291     {
4292       int partition_i = m_vertices[node_i].partition;
4293       if (partition_i >= 0)
4294         {
4295           unsigned int order_i = m_partitions[partition_i].node_end++;
4296           m_partitioned_nodes[order_i] = node_i;
4297         }
4298     }
4299 }
4300
4301 /* Look for edges from earlier partitions into node NODE_I and edges from
4302    node NODE_I into later partitions.  Call:
4303
4304       FN (ud, other_node_i)
4305
4306    for each such use-to-def edge ud, where other_node_i is the node at the
4307    other end of the edge.  */
4308
4309 template<typename T>
4310 void
4311 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4312 {
4313   int partition_i = m_vertices[node_i].partition;
4314   for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4315        pred; pred = pred->pred_next)
4316     {
4317       int src_partition_i = m_vertices[pred->src].partition;
4318       if (src_partition_i >= 0 && src_partition_i != partition_i)
4319         fn (pred, pred->src);
4320     }
4321   for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4322        succ; succ = succ->succ_next)
4323     {
4324       int dest_partition_i = m_vertices[succ->dest].partition;
4325       if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4326         fn (succ, succ->dest);
4327     }
4328 }
4329
4330 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4331    that NODE would operate on.  This test is independent of NODE's actual
4332    operation.  */
4333
4334 bool
4335 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4336                                               unsigned int layout_i)
4337 {
4338   if (layout_i == 0)
4339     return true;
4340
4341   if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4342     return false;
4343
4344   return true;
4345 }
4346
4347 /* Return the cost (in arbtirary units) of going from layout FROM_LAYOUT_I
4348    to layout TO_LAYOUT_I for a node like NODE.  Return -1 if either of the
4349    layouts is incompatible with NODE or if the change is not possible for
4350    some other reason.
4351
4352    The properties taken from NODE include the number of lanes and the
4353    vector type.  The actual operation doesn't matter.  */
4354
4355 int
4356 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4357                                             unsigned int from_layout_i,
4358                                             unsigned int to_layout_i)
4359 {
4360   if (!is_compatible_layout (node, from_layout_i)
4361       || !is_compatible_layout (node, to_layout_i))
4362     return -1;
4363
4364   if (from_layout_i == to_layout_i)
4365     return 0;
4366
4367   auto_vec<slp_tree, 1> children (1);
4368   children.quick_push (node);
4369   auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4370   if (from_layout_i > 0)
4371     for (unsigned int i : m_perms[from_layout_i])
4372       perm.quick_push ({ 0, i });
4373   else
4374     for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4375       perm.quick_push ({ 0, i });
4376   if (to_layout_i > 0)
4377     vect_slp_permute (m_perms[to_layout_i], perm, true);
4378   auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4379                                                children, false);
4380   if (count >= 0)
4381     return MAX (count, 1);
4382
4383   /* ??? In principle we could try changing via layout 0, giving two
4384      layout changes rather than 1.  Doing that would require
4385      corresponding support in get_result_with_layout.  */
4386   return -1;
4387 }
4388
4389 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I.  */
4390
4391 inline slpg_partition_layout_costs &
4392 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4393                                                 unsigned int layout_i)
4394 {
4395   return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4396 }
4397
4398 /* Change PERM in one of two ways:
4399
4400    - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4401      chosen for child I of NODE.
4402
4403    - if IN_LAYOUT >= 0, accept all inputs operands with that layout.
4404
4405    In both cases, arrange for the output to have layout OUT_LAYOUT_I  */
4406
4407 void
4408 vect_optimize_slp_pass::
4409 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4410                         int in_layout_i, unsigned int out_layout_i)
4411 {
4412   for (auto &entry : perm)
4413     {
4414       int this_in_layout_i = in_layout_i;
4415       if (this_in_layout_i < 0)
4416         {
4417           slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4418           unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4419           this_in_layout_i = m_partitions[in_partition_i].layout;
4420         }
4421       if (this_in_layout_i > 0)
4422         entry.second = m_perms[this_in_layout_i][entry.second];
4423     }
4424   if (out_layout_i > 0)
4425     vect_slp_permute (m_perms[out_layout_i], perm, true);
4426 }
4427
4428 /* Check whether the target allows NODE to be rearranged so that the node's
4429    output has layout OUT_LAYOUT_I.  Return the cost of the change if so,
4430    in the same arbitrary units as for change_layout_cost.  Return -1 otherwise.
4431
4432    If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4433    NODE can adapt to the layout changes that have (perhaps provisionally)
4434    been chosen for NODE's children, so that no extra permutations are
4435    needed on either the input or the output of NODE.
4436
4437    If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4438    that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4439
4440    IN_LAYOUT_I has no meaning for other types of node.
4441
4442    Keeping the node as-is is always valid.  If the target doesn't appear
4443    to support the node as-is, but might realistically support other layouts,
4444    then layout 0 instead has the cost of a worst-case permutation.  On the
4445    one hand, this ensures that every node has at least one valid layout,
4446    avoiding what would otherwise be an awkward special case.  On the other,
4447    it still encourages the pass to change an invalid pre-existing layout
4448    choice into a valid one.  */
4449
4450 int
4451 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4452                                             unsigned int out_layout_i)
4453 {
4454   const int fallback_cost = 1;
4455
4456   if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4457     {
4458       auto_lane_permutation_t tmp_perm;
4459       tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4460
4461       /* Check that the child nodes support the chosen layout.  Checking
4462          the first child is enough, since any second child would have the
4463          same shape.  */
4464       auto first_child = SLP_TREE_CHILDREN (node)[0];
4465       if (in_layout_i > 0
4466           && !is_compatible_layout (first_child, in_layout_i))
4467         return -1;
4468
4469       change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4470       int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4471                                                   node, tmp_perm,
4472                                                   SLP_TREE_CHILDREN (node),
4473                                                   false);
4474       if (count < 0)
4475         {
4476           if (in_layout_i == 0 && out_layout_i == 0)
4477             {
4478               /* Use the fallback cost if the node could in principle support
4479                  some nonzero layout for both the inputs and the outputs.
4480                  Otherwise assume that the node will be rejected later
4481                  and rebuilt from scalars.  */
4482               if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4483                 return fallback_cost;
4484               return 0;
4485             }
4486           return -1;
4487         }
4488
4489       /* We currently have no way of telling whether the new layout is cheaper
4490          or more expensive than the old one.  But at least in principle,
4491          it should be worth making zero permutations (whole-vector shuffles)
4492          cheaper than real permutations, in case the pass is able to remove
4493          the latter.  */
4494       return count == 0 ? 0 : 1;
4495     }
4496
4497   stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4498   if (rep
4499       && STMT_VINFO_DATA_REF (rep)
4500       && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4501       && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4502     {
4503       auto_load_permutation_t tmp_perm;
4504       tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4505       if (out_layout_i > 0)
4506         vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4507
4508       poly_uint64 vf = 1;
4509       if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4510         vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4511       unsigned int n_perms;
4512       if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4513                                            nullptr, vf, true, false, &n_perms))
4514         {
4515           auto rep = SLP_TREE_REPRESENTATIVE (node);
4516           if (out_layout_i == 0)
4517             {
4518               /* Use the fallback cost if the load is an N-to-N permutation.
4519                  Otherwise assume that the node will be rejected later
4520                  and rebuilt from scalars.  */
4521               if (STMT_VINFO_GROUPED_ACCESS (rep)
4522                   && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4523                       == SLP_TREE_LANES (node)))
4524                 return fallback_cost;
4525               return 0;
4526             }
4527           return -1;
4528         }
4529
4530       /* See the comment above the corresponding VEC_PERM_EXPR handling.  */
4531       return n_perms == 0 ? 0 : 1;
4532     }
4533
4534   return 0;
4535 }
4536
4537 /* Decide which element layouts we should consider using.  Calculate the
4538    weights associated with inserting layout changes on partition edges.
4539    Also mark partitions that cannot change layout, by setting their
4540    layout to zero.
        The work is done in three steps:
        - Walk the partitioned nodes and derive a candidate layout from each
          grouped load permutation or single-input VEC_PERM_EXPR whose
          indices cover a contiguous range exactly once.  Distinct layouts
          are appended to m_perms, whose entry 0 stands for "no change".
        - Size m_partition_layout_costs so that it has one cleared entry per
          (partition, layout) pair, and set to layout 0 the partition that
          holds the tree of each slp_inst_kind_ctor instance.
        - Record each node's weight and count the cross-partition edges
          (in_degree/out_degree per partition, out_weight/out_degree per
          vertex whose result is used).  */
4541
4542 void
4543 vect_optimize_slp_pass::start_choosing_layouts ()
4544 {
4545   /* Used to assign unique permutation indices.  */
4546   using perm_hash = unbounded_hashmap_traits<
4547     vec_free_hash_base<int_hash_base<unsigned>>,
4548     int_hash<int, -1, -2>
4549   >;
       /* Maps each distinct candidate permutation to its index in m_perms.  */
4550   hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4551
4552   /* Layout 0 is "no change".  */
4553   m_perms.safe_push (vNULL);
4554
4555   /* Create layouts from existing permutations.  */
4556   auto_load_permutation_t tmp_perm;
4557   for (unsigned int node_i : m_partitioned_nodes)
4558     {
4559       /* Leafs also double as entries to the reverse graph.  Allow the
4560          layout of those to be changed.  */
4561       auto &vertex = m_vertices[node_i];
4562       auto &partition = m_partitions[vertex.partition];
           /* Layout 0 is only a starting value here: the code further down
              may overwrite it with -1 or with a candidate layout index.  */
4563       if (!m_slpg->vertices[node_i].succ)
4564         partition.layout = 0;
4565
4566       /* Loads and VEC_PERM_EXPRs are the only things generating permutes.  */
4567       slp_tree node = vertex.node;
4568       stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4569       slp_tree child;
           /* IMIN is assigned in both branches below before it is read.  */
4570       unsigned HOST_WIDE_INT imin, imax = 0;
4571       bool any_permute = false;
           /* TMP_PERM is a scratch vector shared by all iterations.  */
4572       tmp_perm.truncate (0);
4573       if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4574         {
4575           /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4576              unpermuted, record a layout that reverses this permutation.
4577
4578              We would need more work to cope with loads that are internally
4579              permuted and also have inputs (such as masks for
4580              IFN_MASK_LOADs).  */
4581           gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
               /* Non-grouped loads contribute no layout; their partition
                  keeps the layout 0 asserted above.  */
4582           if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4583             continue;
4584           dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
               /* Seed IMIN with a value above the group size; the loop below
                  lowers it to the smallest permutation index.  */
4585           imin = DR_GROUP_SIZE (dr_stmt) + 1;
4586           tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4587         }
4588       else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4589                && SLP_TREE_CHILDREN (node).length () == 1
4590                && (child = SLP_TREE_CHILDREN (node)[0])
4591                && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4592                    .is_constant (&imin)))
4593         {
4594           /* If the child has the same vector size as this node,
4595              reversing the permutation can make the permutation a no-op.
4596              In other cases it can change a true permutation into a
4597              full-vector extract.  */
               /* NOTE(review): IMIN was seeded by is_constant in the condition
                  above, presumably with the child's number of vector
                  elements -- confirm against the poly_int API.  Only the
                  source lane of each pair is kept in TMP_PERM.  */
4598           tmp_perm.reserve (SLP_TREE_LANES (node));
4599           for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4600             tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4601         }
4602       else
4603         continue;
4604
           /* Find the range [IMIN, IMAX] spanned by the indices.  ANY_PERMUTE
              stays false only if the indices are consecutive, starting at
              TMP_PERM[0].  */
4605       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4606         {
4607           unsigned idx = tmp_perm[j];
4608           imin = MIN (imin, idx);
4609           imax = MAX (imax, idx);
4610           if (idx - tmp_perm[0] != j)
4611             any_permute = true;
4612         }
4613       /* If the span doesn't match we'd disrupt VF computation, avoid
4614          that for now.  */
4615       if (imax - imin + 1 != SLP_TREE_LANES (node))
4616         continue;
4617       /* If there's no permute no need to split one out.  In this case
4618          we can consider turning a load into a permuted load, if that
4619          turns out to be cheaper than alternatives.  */
4620       if (!any_permute)
4621         {
4622           partition.layout = -1;
4623           continue;
4624         }
4625
4626       /* For now only handle true permutes, like
4627          vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
4628          when permuting constants and invariants keeping the permute
4629          bijective.  */
           /* LOAD_INDEX gets one bit per offset from IMIN; the permutation is
              accepted only if every offset in [0, lanes) occurs.  */
4630       auto_sbitmap load_index (SLP_TREE_LANES (node));
4631       bitmap_clear (load_index);
4632       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4633         bitmap_set_bit (load_index, tmp_perm[j] - imin);
4634       unsigned j;
4635       for (j = 0; j < SLP_TREE_LANES (node); ++j)
4636         if (!bitmap_bit_p (load_index, j))
4637           break;
4638       if (j != SLP_TREE_LANES (node))
4639         continue;
4640
           /* Build the candidate layout with its indices rebased to start
              at zero.  PERM is either pushed onto m_perms or released
              below.  */
4641       vec<unsigned> perm = vNULL;
4642       perm.safe_grow (SLP_TREE_LANES (node), true);
4643       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4644         perm[j] = tmp_perm[j] - imin;
4645
4646       if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4647         {
4648           /* Continue to use existing layouts, but don't add any more.  */
4649           int *entry = layout_ids.get (perm);
4650           partition.layout = entry ? *entry : 0;
4651           perm.release ();
4652         }
4653       else
4654         {
4655           bool existed;
4656           int &layout_i = layout_ids.get_or_insert (perm, &existed);
               /* A layout seen before keeps its existing index; a new one
                  is given the next free index in m_perms.  */
4657           if (existed)
4658             perm.release ();
4659           else
4660             {
4661               layout_i = m_perms.length ();
4662               m_perms.safe_push (perm);
4663             }
4664           partition.layout = layout_i;
4665         }
4666     }
4667
4668   /* Initially assume that every layout is possible and has zero cost
4669      in every partition.  */
4670   m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4671                                               * m_perms.length ());
4672
4673   /* We have to mark outgoing permutations facing non-reduction graph
4674      entries that are not represented as to be materialized.  */
4675   for (slp_instance instance : m_vinfo->slp_instances)
4676     if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4677       {
4678         unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4679         m_partitions[m_vertices[node_i].partition].layout = 0;
4680       }
4681
4682   /* Check which layouts each node and partition can handle.  Calculate the
4683      weights associated with inserting layout changes on edges.  */
4684   for (unsigned int node_i : m_partitioned_nodes)
4685     {
4686       auto &vertex = m_vertices[node_i];
4687       auto &partition = m_partitions[vertex.partition];
4688       slp_tree node = vertex.node;
4689
           /* Nodes without a representative statement are not given a
              weight here.  */
4690       if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4691         {
4692           vertex.weight = vect_slp_node_weight (node);
4693
4694           /* We do not handle stores with a permutation, so all
4695              incoming permutations must have been materialized.
4696
4697              We also don't handle masked grouped loads, which lack a
4698              permutation vector.  In this case the memory locations
4699              form an implicit second input to the loads, on top of the
4700              explicit mask input, and the memory input's layout cannot
4701              be changed.
4702
4703              On the other hand, we do support permuting gather loads and
4704              masked gather loads, where each scalar load is independent
4705              of the others.  This can be useful if the address/index input
4706              benefits from permutation.  */
4707           if (STMT_VINFO_DATA_REF (rep)
4708               && STMT_VINFO_GROUPED_ACCESS (rep)
4709               && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4710             partition.layout = 0;
4711
4712           /* We cannot change the layout of an operation that is
4713              not independent on lanes.  Note this is an explicit
4714              negative list since that's much shorter than the respective
4715              positive one but it's critical to keep maintaining it.  */
4716           if (is_gimple_call (STMT_VINFO_STMT (rep)))
4717             switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4718               {
4719               case CFN_COMPLEX_ADD_ROT90:
4720               case CFN_COMPLEX_ADD_ROT270:
4721               case CFN_COMPLEX_MUL:
4722               case CFN_COMPLEX_MUL_CONJ:
4723               case CFN_VEC_ADDSUB:
4724               case CFN_VEC_FMADDSUB:
4725               case CFN_VEC_FMSUBADD:
                     /* There is no break: control falls through into the
                        empty default label.  */
4726                 partition.layout = 0;
4727               default:;
4728               }
4729         }
4730
4731       auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4732         {
4733           auto &other_vertex = m_vertices[other_node_i];
4734
4735           /* Count the number of edges from earlier partitions and the number
4736              of edges to later partitions.  */
4737           if (other_vertex.partition < vertex.partition)
4738             partition.in_degree += 1;
4739           else
4740             partition.out_degree += 1;
4741
4742           /* If the current node uses the result of OTHER_NODE_I, accumulate
4743              the effects of that.  */
4744           if (ud->src == int (node_i))
4745             {
4746               other_vertex.out_weight += vertex.weight;
4747               other_vertex.out_degree += 1;
4748             }
4749         };
4750       for_each_partition_edge (node_i, process_edge);
4751     }
4752 }
4753
4754 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4755    its current (provisional) choice of layout.  The inputs do not necessarily
4756    have the same layout as each other.  */
4757
4758 slpg_layout_cost
4759 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4760 {
4761   auto &vertex = m_vertices[node_i];
4762   slpg_layout_cost cost;
4763   auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4764     {
4765       auto &other_vertex = m_vertices[other_node_i];
4766       if (other_vertex.partition < vertex.partition)
4767         {
4768           auto &other_partition = m_partitions[other_vertex.partition];
4769           auto &other_costs = partition_layout_costs (other_vertex.partition,
4770                                                       other_partition.layout);
4771           slpg_layout_cost this_cost = other_costs.in_cost;
4772           this_cost.add_serial_cost (other_costs.internal_cost);
4773           this_cost.split (other_partition.out_degree);
4774           cost.add_parallel_cost (this_cost);
4775         }
4776     };
4777   for_each_partition_edge (node_i, add_cost);
4778   return cost;
4779 }
4780
4781 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4782    and layout LAYOUT2_I on cross-partition use-to-def edge UD.  Return
4783    slpg_layout_cost::impossible () if the change isn't possible.  */
4784
4785 slpg_layout_cost
4786 vect_optimize_slp_pass::
4787 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4788                   unsigned int layout2_i)
4789 {
4790   auto &def_vertex = m_vertices[ud->dest];
4791   auto &use_vertex = m_vertices[ud->src];
4792   auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4793   auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4794   auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4795                                     use_layout_i);
4796   if (factor < 0)
4797     return slpg_layout_cost::impossible ();
4798
4799   /* We have a choice of putting the layout change at the site of the
4800      definition or at the site of the use.  Prefer the former when
4801      optimizing for size or when the execution frequency of the
4802      definition is no greater than the combined execution frequencies of
4803      the uses.  When putting the layout change at the site of the definition,
4804      divvy up the cost among all consumers.  */
4805   if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4806     {
4807       slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4808       cost.split (def_vertex.out_degree);
4809       return cost;
4810     }
4811   return { use_vertex.weight * factor, m_optimize_size };
4812 }
4813
4814 /* UD represents a use-def link between FROM_NODE_I and a node in a later
4815    partition; FROM_NODE_I could be the definition node or the use node.
4816    The node at the other end of the link wants to use layout TO_LAYOUT_I.
4817    Return the cost of any necessary fix-ups on edge UD, or return
4818    slpg_layout_cost::impossible () if the change isn't possible.
4819
4820    At this point, FROM_NODE_I's partition has chosen the cheapest
4821    layout based on the information available so far, but this choice
4822    is only provisional.  */
4823
4824 slpg_layout_cost
4825 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
4826                                       unsigned int to_layout_i)
4827 {
4828   auto &from_vertex = m_vertices[from_node_i];
4829   unsigned int from_partition_i = from_vertex.partition;
4830   slpg_partition_info &from_partition = m_partitions[from_partition_i];
4831   gcc_assert (from_partition.layout >= 0);
4832
4833   /* First calculate the cost on the assumption that FROM_PARTITION sticks
4834      with its current layout preference.  */
4835   slpg_layout_cost cost = slpg_layout_cost::impossible ();
4836   auto edge_cost = edge_layout_cost (ud, from_node_i,
4837                                      from_partition.layout, to_layout_i);
4838   if (edge_cost.is_possible ())
4839     {
4840       auto &from_costs = partition_layout_costs (from_partition_i,
4841                                                  from_partition.layout);
4842       cost = from_costs.in_cost;
4843       cost.add_serial_cost (from_costs.internal_cost);
4844       cost.split (from_partition.out_degree);
4845       cost.add_serial_cost (edge_cost);
4846     }
4847
4848   /* Take the minimum of that cost and the cost that applies if
4849      FROM_PARTITION instead switches to TO_LAYOUT_I.  */
4850   auto &direct_layout_costs = partition_layout_costs (from_partition_i,
4851                                                       to_layout_i);
4852   if (direct_layout_costs.is_possible ())
4853     {
4854       slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
4855       direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
4856       direct_cost.split (from_partition.out_degree);
4857       if (!cost.is_possible ()
4858           || direct_cost.is_better_than (cost, m_optimize_size))
4859         cost = direct_cost;
4860     }
4861
4862   return cost;
4863 }
4864
4865 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
4866    partition; TO_NODE_I could be the definition node or the use node.
4867    The node at the other end of the link wants to use layout FROM_LAYOUT_I;
4868    return the cost of any necessary fix-ups on edge UD, or
4869    slpg_layout_cost::impossible () if the choice cannot be made.
4870
4871    At this point, TO_NODE_I's partition has a fixed choice of layout.
        Two strategies are tried in order.  If TO_NODE_I is a VEC_PERM_EXPR
        that uses the other node, the permutation's own cost is recomputed
        with that input in FROM_LAYOUT_I.  Otherwise, or if that
        recomputation reports failure, an explicit layout change on UD is
        costed with edge_layout_cost.  */
4872
4873 slpg_layout_cost
4874 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
4875                                        unsigned int from_layout_i)
4876 {
4877   auto &to_vertex = m_vertices[to_node_i];
4878   unsigned int to_partition_i = to_vertex.partition;
4879   slpg_partition_info &to_partition = m_partitions[to_partition_i];
4880   gcc_assert (to_partition.layout >= 0);
4881
4882   /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
4883      adjusted for this input having layout FROM_LAYOUT_I.  Assume that
4884      any other inputs keep their current choice of layout.  */
4885   auto &to_costs = partition_layout_costs (to_partition_i,
4886                                            to_partition.layout);
4887   if (ud->src == int (to_node_i)
4888       && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
4889     {
           /* Temporarily switch the defining partition to FROM_LAYOUT_I for
              the duration of the internal_node_cost call, then restore it.
              NOTE(review): this presumably works because internal_node_cost,
              given an input layout of -1, reads the layouts currently
              recorded for the children's partitions -- confirm.  */
4890       auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
4891       auto old_layout = from_partition.layout;
4892       from_partition.layout = from_layout_i;
4893       int factor = internal_node_cost (to_vertex.node, -1,
4894                                        to_partition.layout);
4895       from_partition.layout = old_layout;
           /* A negative factor means the adjustment is not available; fall
              through to the edge-based costing below in that case.  */
4896       if (factor >= 0)
4897         {
               /* The recomputed node cost is used here instead of
                  TO_COSTS.internal_cost.  */
4898           slpg_layout_cost cost = to_costs.out_cost;
4899           cost.add_serial_cost ({ to_vertex.weight * factor,
4900                                   m_optimize_size });
4901           cost.split (to_partition.in_degree);
4902           return cost;
4903         }
4904     }
4905
4906   /* Compute the cost if we insert any necessary layout change on edge UD.  */
4907   auto edge_cost = edge_layout_cost (ud, to_node_i,
4908                                      to_partition.layout, from_layout_i);
4909   if (edge_cost.is_possible ())
4910     {
           /* The partition's outgoing and internal costs are split by its
              in_degree before the edge cost is added, so the edge cost itself
              is not split.  */
4911       slpg_layout_cost cost = to_costs.out_cost;
4912       cost.add_serial_cost (to_costs.internal_cost);
4913       cost.split (to_partition.in_degree);
4914       cost.add_serial_cost (edge_cost);
4915       return cost;
4916     }
4917
4918   return slpg_layout_cost::impossible ();
4919 }
4920
4921 /* Make a forward pass through the partitions, accumulating input costs.
4922    Make a tentative (provisional) choice of layout for each partition,
4923    ensuring that this choice still allows later partitions to keep
4924    their original layout.
        For each partition, every layout in m_perms is checked against each
        node of the partition (is_compatible_layout and internal_node_cost)
        and against each cross-partition edge.  Layouts that fail a check are
        marked impossible in the partition's cost table.  The cheapest
        surviving layout is stored in the partition's layout field.  */
4925
4926 void
4927 vect_optimize_slp_pass::forward_pass ()
4928 {
4929   for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
4930        ++partition_i)
4931     {
4932       auto &partition = m_partitions[partition_i];
4933
4934       /* If the partition consists of a single VEC_PERM_EXPR, precompute
4935          the incoming cost that would apply if every predecessor partition
4936          keeps its current layout.  This is used within the loop below.  */
           /* IN_COST keeps its default-constructed value in every other
              case, and is only read below under the same single-node
              VEC_PERM_EXPR condition.  */
4937       slpg_layout_cost in_cost;
4938       slp_tree single_node = nullptr;
4939       if (partition.node_end == partition.node_begin + 1)
4940         {
4941           unsigned int node_i = m_partitioned_nodes[partition.node_begin];
4942           single_node = m_vertices[node_i].node;
4943           if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
4944             in_cost = total_in_cost (node_i);
4945         }
4946
4947       /* Go through the possible layouts.  Decide which ones are valid
4948          for this partition and record which of the valid layouts has
4949          the lowest cost.  */
4950       unsigned int min_layout_i = 0;
4951       slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
4952       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
4953         {
4954           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
4955           if (!layout_costs.is_possible ())
4956             continue;
4957
4958           /* If the recorded layout is already 0 then the layout cannot
4959              change.  */
4960           if (partition.layout == 0 && layout_i != 0)
4961             {
4962               layout_costs.mark_impossible ();
4963               continue;
4964             }
4965
4966           bool is_possible = true;
4967           for (unsigned int order_i = partition.node_begin;
4968                order_i < partition.node_end; ++order_i)
4969             {
4970               unsigned int node_i = m_partitioned_nodes[order_i];
4971               auto &vertex = m_vertices[node_i];
4972
4973               /* Reject the layout if it is individually incompatible
4974                  with any node in the partition.  */
4975               if (!is_compatible_layout (vertex.node, layout_i))
4976                 {
4977                   is_possible = false;
4978                   break;
4979                 }
4980
4981               auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
4982                 {
4983                   auto &other_vertex = m_vertices[other_node_i];
4984                   if (other_vertex.partition < vertex.partition)
4985                     {
4986                       /* Accumulate the incoming costs from earlier
4987                          partitions, plus the cost of any layout changes
4988                          on UD itself.  */
4989                       auto cost = forward_cost (ud, other_node_i, layout_i);
4990                       if (!cost.is_possible ())
4991                         is_possible = false;
4992                       else
4993                         layout_costs.in_cost.add_parallel_cost (cost);
4994                     }
4995                   else
4996                     /* Reject the layout if it would make layout 0 impossible
4997                        for later partitions.  This amounts to testing that the
4998                        target supports reversing the layout change on edges
4999                        to later partitions.
5000
5001                        In principle, it might be possible to push a layout
5002                        change all the way down a graph, so that it never
5003                        needs to be reversed and so that the target doesn't
5004                        need to support the reverse operation.  But it would
5005                        be awkward to bail out if we hit a partition that
5006                        does not support the new layout, especially since
5007                        we are not dealing with a lattice.  */
5008                     is_possible &= edge_layout_cost (ud, other_node_i, 0,
5009                                                      layout_i).is_possible ();
5010                 };
5011               for_each_partition_edge (node_i, add_cost);
                   /* A failure flagged by ADD_COST does not stop the walk
                      here: the remaining nodes are still visited and the
                      layout is rejected once the node loop has finished.  */
5012
5013               /* Accumulate the cost of using LAYOUT_I within NODE,
5014                  both for the inputs and the outputs.  */
5015               int factor = internal_node_cost (vertex.node, layout_i,
5016                                                layout_i);
5017               if (factor < 0)
5018                 {
5019                   is_possible = false;
5020                   break;
5021                 }
5022               else if (factor)
5023                 layout_costs.internal_cost.add_serial_cost
5024                   ({ vertex.weight * factor, m_optimize_size });
5025             }
5026           if (!is_possible)
5027             {
5028               layout_costs.mark_impossible ();
5029               continue;
5030             }
5031
5032           /* Combine the incoming and partition-internal costs.  */
5033           slpg_layout_cost combined_cost = layout_costs.in_cost;
5034           combined_cost.add_serial_cost (layout_costs.internal_cost);
5035
5036           /* If this partition consists of a single VEC_PERM_EXPR, see
5037              if the VEC_PERM_EXPR can be changed to support output layout
5038              LAYOUT_I while keeping all the provisional choices of input
5039              layout.  */
5040           if (single_node
5041               && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5042             {
5043               int factor = internal_node_cost (single_node, -1, layout_i);
5044               if (factor >= 0)
5045                 {
5046                   auto weight = m_vertices[single_node->vertex].weight;
5047                   slpg_layout_cost internal_cost
5048                     = { weight * factor, m_optimize_size };
5049
                       /* The alternative pairs the precomputed IN_COST with
                          the adjusted node cost.  If it wins, it also
                          replaces the in_cost and internal_cost recorded for
                          this layout.  */
5050                   slpg_layout_cost alt_cost = in_cost;
5051                   alt_cost.add_serial_cost (internal_cost);
5052                   if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5053                     {
5054                       combined_cost = alt_cost;
5055                       layout_costs.in_cost = in_cost;
5056                       layout_costs.internal_cost = internal_cost;
5057                     }
5058                 }
5059             }
5060
5061           /* Record the layout with the lowest cost.  Prefer layout 0 in
5062              the event of a tie between it and another layout.  */
               /* Layouts are visited in increasing index order, so layout 0
                  wins ties provided is_better_than is a strict comparison
                  (presumably -- its definition is not visible here).  */
5063           if (!min_layout_cost.is_possible ()
5064               || combined_cost.is_better_than (min_layout_cost,
5065                                                m_optimize_size))
5066             {
5067               min_layout_i = layout_i;
5068               min_layout_cost = combined_cost;
5069             }
5070         }
5071
5072       /* This loop's handling of earlier partitions should ensure that
5073          choosing the original layout for the current partition is no
5074          less valid than it was in the original graph, even with the
5075          provisional layout choices for those earlier partitions.  */
5076       gcc_assert (min_layout_cost.is_possible ());
5077       partition.layout = min_layout_i;
5078     }
5079 }
5080
5081 /* Make a backward pass through the partitions, accumulating output costs.
5082    Make a final choice of layout for each partition.
        Partitions are visited from last to first.  For every layout that is
        still marked possible, the costs reported by backward_cost for edges
        to later partitions are accumulated into that layout's out_cost.
        Edges to earlier partitions are only checked for feasibility against
        the layout those partitions currently hold.  */
5083
5084 void
5085 vect_optimize_slp_pass::backward_pass ()
5086 {
       /* The loop counts down with an unsigned index: the test is made on
          the value before the decrement, so the body sees length - 1
          down to 0.  */
5087   for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5088     {
5089       auto &partition = m_partitions[partition_i];
5090
5091       unsigned int min_layout_i = 0;
5092       slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5093       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5094         {
5095           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5096           if (!layout_costs.is_possible ())
5097             continue;
5098
5099           /* Accumulate the costs from successor partitions.  */
5100           bool is_possible = true;
5101           for (unsigned int order_i = partition.node_begin;
5102                order_i < partition.node_end; ++order_i)
5103             {
5104               unsigned int node_i = m_partitioned_nodes[order_i];
5105               auto &vertex = m_vertices[node_i];
5106               auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5107                 {
5108                   auto &other_vertex = m_vertices[other_node_i];
5109                   auto &other_partition = m_partitions[other_vertex.partition];
5110                   if (other_vertex.partition > vertex.partition)
5111                     {
5112                       /* Accumulate the incoming costs from later
5113                          partitions, plus the cost of any layout changes
5114                          on UD itself.  */
5115                       auto cost = backward_cost (ud, other_node_i, layout_i);
5116                       if (!cost.is_possible ())
5117                         is_possible = false;
5118                       else
5119                         layout_costs.out_cost.add_parallel_cost (cost);
5120                     }
5121                   else
5122                     /* Make sure that earlier partitions can (if necessary
5123                        or beneficial) keep the layout that they chose in
5124                        the forward pass.  This ensures that there is at
5125                        least one valid choice of layout.  */
5126                     is_possible &= edge_layout_cost (ud, other_node_i,
5127                                                      other_partition.layout,
5128                                                      layout_i).is_possible ();
5129                 };
5130               for_each_partition_edge (node_i, add_cost);
5131             }
5132           if (!is_possible)
5133             {
5134               layout_costs.mark_impossible ();
5135               continue;
5136             }
5137
5138           /* Locally combine the costs from the forward and backward passes.
5139              (This combined cost is not passed on, since that would lead
5140              to double counting.)  */
5141           slpg_layout_cost combined_cost = layout_costs.in_cost;
5142           combined_cost.add_serial_cost (layout_costs.internal_cost);
5143           combined_cost.add_serial_cost (layout_costs.out_cost);
5144
5145           /* Record the layout with the lowest cost.  Prefer layout 0 in
5146              the event of a tie between it and another layout.  */
               /* Layouts are visited in increasing index order, so layout 0
                  wins ties provided is_better_than is a strict comparison
                  (presumably -- its definition is not visible here).  */
5147           if (!min_layout_cost.is_possible ()
5148               || combined_cost.is_better_than (min_layout_cost,
5149                                                m_optimize_size))
5150             {
5151               min_layout_i = layout_i;
5152               min_layout_cost = combined_cost;
5153             }
5154         }
5155
           /* Overwrite the provisional choice made by the forward pass.  */
5156       gcc_assert (min_layout_cost.is_possible ());
5157       partition.layout = min_layout_i;
5158     }
5159 }
5160
5161 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5162    NODE already has the layout that was selected for its partition.
        Results are remembered in m_node_layouts, indexed by NODE's vertex and
        TO_LAYOUT_I, so a repeated request returns the same node.  Constant
        and external nodes are either reused or rebuilt from permuted scalar
        operands.  For any other node whose partition layout differs from
        TO_LAYOUT_I, a new VEC_PERM_EXPR node is created.  */
5163
5164 slp_tree
5165 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5166                                                 unsigned int to_layout_i)
5167 {
       /* m_node_layouts is a flat array with m_perms.length () slots
          per vertex.  */
5168   unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5169   slp_tree result = m_node_layouts[result_i];
5170   if (result)
5171     return result;
5172
5173   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5174       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5175     {
5176       /* If the vector is uniform or unchanged, there's nothing to do.  */
5177       if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5178         result = node;
5179       else
5180         {
               /* NOTE(review): SCALAR_OPS is permuted after it has been
                  handed to the new node.  This presumably relies on vec
                  copies sharing their storage, so that the new node sees the
                  permuted order -- confirm.  */
5181           auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5182           result = vect_create_new_slp_node (scalar_ops);
5183           vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5184         }
5185     }
5186   else
5187     {
5188       unsigned int partition_i = m_vertices[node->vertex].partition;
5189       unsigned int from_layout_i = m_partitions[partition_i].layout;
           /* NODE itself is returned without being recorded in
              m_node_layouts when it already has the requested layout.  */
5190       if (from_layout_i == to_layout_i)
5191         return node;
5192
5193       /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5194          permutation instead of a serial one.  Leave the new permutation
5195          in TMP_PERM on success.  */
5196       auto_lane_permutation_t tmp_perm;
5197       unsigned int num_inputs = 1;
5198       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5199         {
               /* Apply layout FROM_LAYOUT_I with a false final argument and
                  then TO_LAYOUT_I with a true one.  NOTE(review): the flag
                  presumably selects the direction of the permutation, so
                  this undoes the current layout and applies the requested
                  one -- confirm against vect_slp_permute.  */
5200           tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5201           if (from_layout_i != 0)
5202             vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5203           if (to_layout_i != 0)
5204             vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
               /* On failure (a negative result), empty TMP_PERM so that the
                  code below falls back to a single-input permutation
                  of NODE.  */
5205           if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5206                                               tmp_perm,
5207                                               SLP_TREE_CHILDREN (node),
5208                                               false) >= 0)
5209             num_inputs = SLP_TREE_CHILDREN (node).length ();
5210           else
5211             tmp_perm.truncate (0);
5212         }
5213
5214       if (dump_enabled_p ())
5215         {
5216           if (tmp_perm.length () > 0)
5217             dump_printf_loc (MSG_NOTE, vect_location,
5218                              "duplicating permutation node %p with"
5219                              " layout %d\n",
5220                              (void *) node, to_layout_i);
5221           else
5222             dump_printf_loc (MSG_NOTE, vect_location,
5223                              "inserting permutation node in place of %p\n",
5224                              (void *) node);
5225         }
5226
           /* Build the new VEC_PERM_EXPR node, copying the scalar statements
              (permuted in the same way as the lanes), the representative,
              the lane count and the vector type from NODE.  */
5227       unsigned int num_lanes = SLP_TREE_LANES (node);
5228       result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5229       if (SLP_TREE_SCALAR_STMTS (node).length ())
5230         {
5231           auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5232           stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5233           if (from_layout_i != 0)
5234             vect_slp_permute (m_perms[from_layout_i], stmts, false);
5235           if (to_layout_i != 0)
5236             vect_slp_permute (m_perms[to_layout_i], stmts, true);
5237         }
5238       SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5239       SLP_TREE_LANES (result) = num_lanes;
5240       SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
           /* NOTE(review): -1 presumably marks the new node as having no
              vertex of its own in the graph -- confirm.  */
5241       result->vertex = -1;
5242
5243       auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5244       if (tmp_perm.length ())
5245         {
               /* Parallel form: the new node takes NODE's children directly
                  and uses the combined permutation.  */
5246           lane_perm.safe_splice (tmp_perm);
5247           SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5248         }
5249       else
5250         {
               /* Serial form: start from the identity permutation on input 0,
                  apply the two layouts to it and make NODE the only child.  */
5251           lane_perm.create (num_lanes);
5252           for (unsigned j = 0; j < num_lanes; ++j)
5253             lane_perm.quick_push ({ 0, j });
5254           if (from_layout_i != 0)
5255             vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5256           if (to_layout_i != 0)
5257             vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5258           SLP_TREE_CHILDREN (result).safe_push (node);
5259         }
           /* The new node takes a reference on each of its children.  */
5260       for (slp_tree child : SLP_TREE_CHILDREN (result))
5261         child->refcnt++;
5262     }
5263   m_node_layouts[result_i] = result;
5264   return result;
5265 }
5266
5267 /* Apply the chosen vector layouts to the SLP graph.

        This works in three steps.  First, every partitioned node has its
        scalar statements and its lane or load permutation rewritten for the
        layout chosen for its partition.  Second, redundant load permutations
        are removed.  Third, the children of every node that was not fully
        folded in the first step are replaced by versions that have the
        node's layout.  */
5268
5269 void
5270 vect_optimize_slp_pass::materialize ()
5271 {
5272   /* We no longer need the costs, so avoid having two O(N * P) arrays
5273      live at the same time.  */
5274   m_partition_layout_costs.release ();
5275   m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5276
       /* FULLY_FOLDED has a bit set for each VEC_PERM_EXPR node whose lane
          permutation absorbed the layouts of its inputs.  The children of
          such nodes are left untouched by the final loop.  */
5277   auto_sbitmap fully_folded (m_vertices.length ());
5278   bitmap_clear (fully_folded);
5279   for (unsigned int node_i : m_partitioned_nodes)
5280     {
5281       auto &vertex = m_vertices[node_i];
5282       slp_tree node = vertex.node;
5283       int layout_i = m_partitions[vertex.partition].layout;
5284       gcc_assert (layout_i >= 0);
5285
5286       /* Rearrange the scalar statements to match the chosen layout.  */
5287       if (layout_i > 0)
5288         vect_slp_permute (m_perms[layout_i],
5289                           SLP_TREE_SCALAR_STMTS (node), true);
5290
5291       /* Update load and lane permutations.  */
5292       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5293         {
5294           /* First try to absorb the input vector layouts.  If that fails,
5295              force the inputs to have layout LAYOUT_I too.  We checked that
5296              that was possible before deciding to use nonzero output layouts.
5297              (Note that at this stage we don't really have any guarantee that
5298              the target supports the original VEC_PERM_EXPR.)  */
5299           auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5300           auto_lane_permutation_t tmp_perm;
5301           tmp_perm.safe_splice (perm);
5302           change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5303           if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5304                                               tmp_perm,
5305                                               SLP_TREE_CHILDREN (node),
5306                                               false) >= 0)
5307             {
5308               if (dump_enabled_p ()
5309                   && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5310                                   perm.begin ()))
5311                 dump_printf_loc (MSG_NOTE, vect_location,
5312                                  "absorbing input layouts into %p\n",
5313                                  (void *) node);
                   /* Commit the trial permutation and remember that the
                      children of NODE need no further adjustment.  */
5314               std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5315               bitmap_set_bit (fully_folded, node_i);
5316             }
5317           else
5318             {
5319               /* Not MSG_MISSED because it would make no sense to users.  */
5320               if (dump_enabled_p ())
5321                 dump_printf_loc (MSG_NOTE, vect_location,
5322                                  "failed to absorb input layouts into %p\n",
5323                                  (void *) node);
5324               change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5325             }
5326         }
5327       else
5328         {
5329           gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5330           auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5331           if (layout_i > 0)
5332             /* ???  When we handle non-bijective permutes the idea
5333                is that we can force the load-permutation to be
5334                { min, min + 1, min + 2, ... max }.  But then the
5335                scalar defs might no longer match the lane content
5336                which means wrong-code with live lane vectorization.
5337                So we possibly have to have NULL entries for those.  */
5338             vect_slp_permute (m_perms[layout_i], load_perm, true);
5339         }
5340     }
5341
5342   /* Do this before any nodes disappear, since it involves a walk
5343      over the leaves.  */
5344   remove_redundant_permutations ();
5345
5346   /* Replace each child with a correctly laid-out version.  */
5347   for (unsigned int node_i : m_partitioned_nodes)
5348     {
5349       /* Skip nodes that have already been handled above.  */
5350       if (bitmap_bit_p (fully_folded, node_i))
5351         continue;
5352
5353       auto &vertex = m_vertices[node_i];
5354       int in_layout_i = m_partitions[vertex.partition].layout;
5355       gcc_assert (in_layout_i >= 0);
5356
5357       unsigned j;
5358       slp_tree child;
5359       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5360         {
5361           if (!child)
5362             continue;
5363
5364           slp_tree new_child = get_result_with_layout (child, in_layout_i);
5365           if (new_child != child)
5366             {
                   /* Move the edge from CHILD to NEW_CHILD and count the new
                      use of NEW_CHILD.  NOTE(review): this assumes that
                      vect_free_slp_tree only drops this node's reference to
                      CHILD -- confirm against its definition.  */
5367               vect_free_slp_tree (child);
5368               SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5369               new_child->refcnt += 1;
5370             }
5371         }
5372     }
5373 }
5374
5375 /* Elide load permutations that are not necessary.  Such permutations might
5376    be pre-existing, rather than created by the layout optimizations.

        Only the nodes listed in M_LEAFS that have a load permutation are
        considered.  A permutation that is found to be redundant is released;
        all other nodes are left unchanged.  */
5377
5378 void
5379 vect_optimize_slp_pass::remove_redundant_permutations ()
5380 {
5381   for (unsigned int node_i : m_leafs)
5382     {
5383       slp_tree node = m_vertices[node_i].node;
5384       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5385         continue;
5386
5387       /* In basic block vectorization we allow any subchain of an interleaving
5388          chain.
5389          FORNOW: not in loop SLP because of realignment complications.  */
5390       if (is_a <bb_vec_info> (m_vinfo))
5391         {
5392           bool subchain_p = true;
5393           stmt_vec_info next_load_info = NULL;
5394           stmt_vec_info load_info;
5395           unsigned j;
               /* The stmts form a subchain if every stmt after the first is
                  the DR_GROUP_NEXT_ELEMENT of its predecessor and has a
                  DR_GROUP_GAP of 1.  */
5396           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5397             {
5398               if (j != 0
5399                   && (next_load_info != load_info
5400                       || DR_GROUP_GAP (load_info) != 1))
5401                 {
5402                   subchain_p = false;
5403                   break;
5404                 }
5405               next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5406             }
5407           if (subchain_p)
5408             {
5409               SLP_TREE_LOAD_PERMUTATION (node).release ();
5410               continue;
5411             }
5412         }
5413       else
5414         {
5415           loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5416           stmt_vec_info load_info;
5417           bool this_load_permuted = false;
5418           unsigned j;
               /* The load is permuted if the permutation index of any lane J
                  differs from J.  */
5419           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5420             if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5421               {
5422                 this_load_permuted = true;
5423                 break;
5424               }
5425           stmt_vec_info first_stmt_info
5426             = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5427           if (!this_load_permuted
5428               /* The load requires permutation when unrolling exposes
5429                  a gap either because the group is larger than the SLP
5430                  group-size or because there is a gap between the groups.  */
5431               && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5432                   || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5433                       && DR_GROUP_GAP (first_stmt_info) == 0)))
5434             {
5435               SLP_TREE_LOAD_PERMUTATION (node).release ();
5436               continue;
5437             }
5438         }
5439     }
5440 }
5441
5442 /* Print the partition graph and layout information to the dump file.

        The output lists the permutations in M_PERMS starting at index 1, and
        then for each partition its chosen layout, its nodes, the edges
        reported by for_each_partition_edge for those nodes, and for every
        layout either its costs or the fact that it was rejected.  All output
        is emitted as MSG_NOTE.  */
5443
5444 void
5445 vect_optimize_slp_pass::dump ()
5446 {
5447   dump_printf_loc (MSG_NOTE, vect_location,
5448                    "SLP optimize permutations:\n");
       /* Entry 0 of M_PERMS is not printed.  NOTE(review): presumably because
          layout 0 stands for the original, unpermuted lane order -- confirm.  */
5449   for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5450     {
5451       dump_printf_loc (MSG_NOTE, vect_location, "  %d: { ", layout_i);
5452       const char *sep = "";
5453       for (unsigned int idx : m_perms[layout_i])
5454         {
5455           dump_printf (MSG_NOTE, "%s%d", sep, idx);
5456           sep = ", ";
5457         }
5458       dump_printf (MSG_NOTE, " }\n");
5459     }
5460   dump_printf_loc (MSG_NOTE, vect_location,
5461                    "SLP optimize partitions:\n");
5462   for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5463        ++partition_i)
5464     {
5465       auto &partition = m_partitions[partition_i];
5466       dump_printf_loc (MSG_NOTE, vect_location,  "  -------------\n");
5467       dump_printf_loc (MSG_NOTE, vect_location,
5468                        "  partition %d (layout %d):\n",
5469                        partition_i, partition.layout);
5470       dump_printf_loc (MSG_NOTE, vect_location, "    nodes:\n");
           /* The nodes of a partition occupy the index range
              [node_begin, node_end) of M_PARTITIONED_NODES.  */
5471       for (unsigned int order_i = partition.node_begin;
5472            order_i < partition.node_end; ++order_i)
5473         {
5474           auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5475           dump_printf_loc (MSG_NOTE, vect_location, "      - %p:\n",
5476                            (void *) vertex.node);
5477           dump_printf_loc (MSG_NOTE, vect_location,
5478                            "          weight: %f\n",
5479                            vertex.weight.to_double ());
5480           if (vertex.out_degree)
5481             dump_printf_loc (MSG_NOTE, vect_location,
5482                              "          out weight: %f (degree %d)\n",
5483                              vertex.out_weight.to_double (),
5484                              vertex.out_degree);
5485           if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5486             dump_printf_loc (MSG_NOTE, vect_location,
5487                              "          op: VEC_PERM_EXPR\n");
5488           else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5489             dump_printf_loc (MSG_NOTE, vect_location,
5490                              "          op template: %G", rep->stmt);
5491         }
5492       dump_printf_loc (MSG_NOTE, vect_location, "    edges:\n");
5493       for (unsigned int order_i = partition.node_begin;
5494            order_i < partition.node_end; ++order_i)
5495         {
5496           unsigned int node_i = m_partitioned_nodes[order_i];
5497           auto &vertex = m_vertices[node_i];
               /* An edge whose other end lies in a lower-numbered partition is
                  printed as pointing at VERTEX; any other edge is printed as
                  leaving VERTEX.  The bracketed number is the partition of
                  the other end.  */
5498           auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5499             {
5500               auto &other_vertex = m_vertices[other_node_i];
5501               if (other_vertex.partition < vertex.partition)
5502                 dump_printf_loc (MSG_NOTE, vect_location,
5503                                  "      - %p [%d] --> %p\n",
5504                                  (void *) other_vertex.node,
5505                                  other_vertex.partition,
5506                                  (void *) vertex.node);
5507               else
5508                 dump_printf_loc (MSG_NOTE, vect_location,
5509                                  "      - %p --> [%d] %p\n",
5510                                  (void *) vertex.node,
5511                                  other_vertex.partition,
5512                                  (void *) other_vertex.node);
5513             };
5514           for_each_partition_edge (node_i, print_edge);
5515         }
5516
5517       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5518         {
5519           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5520           if (layout_costs.is_possible ())
5521             {
                   /* The layout chosen for the partition is flagged with
                      "(*)".  */
5522               dump_printf_loc (MSG_NOTE, vect_location,
5523                                "    layout %d:%s\n", layout_i,
5524                                partition.layout == int (layout_i)
5525                                ? " (*)" : "");
                   /* Print the in, internal and out costs followed by their
                      serial combination.  */
5526               slpg_layout_cost combined_cost = layout_costs.in_cost;
5527               combined_cost.add_serial_cost (layout_costs.internal_cost);
5528               combined_cost.add_serial_cost (layout_costs.out_cost);
5529 #define TEMPLATE "{depth: %f, total: %f}"
5530               dump_printf_loc (MSG_NOTE, vect_location,
5531                                "        " TEMPLATE "\n",
5532                                layout_costs.in_cost.depth.to_double (),
5533                                layout_costs.in_cost.total.to_double ());
5534               dump_printf_loc (MSG_NOTE, vect_location,
5535                                "      + " TEMPLATE "\n",
5536                                layout_costs.internal_cost.depth.to_double (),
5537                                layout_costs.internal_cost.total.to_double ());
5538               dump_printf_loc (MSG_NOTE, vect_location,
5539                                "      + " TEMPLATE "\n",
5540                                layout_costs.out_cost.depth.to_double (),
5541                                layout_costs.out_cost.total.to_double ());
5542               dump_printf_loc (MSG_NOTE, vect_location,
5543                                "      = " TEMPLATE "\n",
5544                                combined_cost.depth.to_double (),
5545                                combined_cost.total.to_double ());
5546 #undef TEMPLATE
5547             }
5548           else
5549             dump_printf_loc (MSG_NOTE, vect_location,
5550                              "    layout %d: rejected\n", layout_i);
5551         }
5552     }
5553 }
5554
5555 /* Main entry point for the SLP graph optimization pass.

        Build the graph and its partitions and start choosing layouts.  If
        M_PERMS then holds more than one entry, run the forward and backward
        passes, dump the result when dumping is enabled, materialize the
        chosen layouts and release every entry of M_PERMS.  Otherwise only
        remove redundant load permutations.  The graph is freed in both
        cases.  */
5556
5557 void
5558 vect_optimize_slp_pass::run ()
5559 {
5560   build_graph ();
5561   create_partitions ();
5562   start_choosing_layouts ();
5563   if (m_perms.length () > 1)
5564     {
5565       forward_pass ();
5566       backward_pass ();
           /* dump () must run before materialize (), which releases the
              partition layout costs that dump () prints.  */
5567       if (dump_enabled_p ())
5568         dump ();
5569       materialize ();
5570       while (!m_perms.is_empty ())
5571         m_perms.pop ().release ();
5572     }
5573   else
5574     remove_redundant_permutations ();
5575   free_graph (m_slpg);
5576 }
5577
5578 /* Optimize the SLP graph of VINFO; a no-op if it has no SLP instances.  */
5579
5580 void
5581 vect_optimize_slp (vec_info *vinfo)
5582 {
5583   const bool have_instances = !vinfo->slp_instances.is_empty ();
5584   if (have_instances)
5585     vect_optimize_slp_pass (vinfo).run ();
5586 }
5587
5588 /* Gather loads reachable from the individual SLP graph entries of VINFO.  */
5589
5590 void
5591 vect_gather_slp_loads (vec_info *vinfo)
5592 {
5593   for (slp_instance entry : vinfo->slp_instances)
5594     {
5595       /* Each entry is walked with its own, initially empty, visited set
5596          and collects into its own SLP_INSTANCE_LOADS.  */
5597       hash_set<slp_tree> seen;
5598       slp_tree root = SLP_INSTANCE_TREE (entry);
5599       vect_gather_slp_loads (SLP_INSTANCE_LOADS (entry), root, seen);
5600     }
5601 }
5602
5603
5604 /* Decide which SLP instances of LOOP_VINFO to vectorize using SLP (FORNOW
5605    all of them) and compute the overall unrolling factor needed to SLP the
5606    loop.  Return TRUE if at least one instance was selected.  */
5607
5608 bool
5609 vect_make_slp_decision (loop_vec_info loop_vinfo)
5610 {
5611   DUMP_VECT_SCOPE ("vect_make_slp_decision");
5612
5613   /* Number of instances selected and their common unroll multiple.  */
5614   int num_slp = 0;
5615   poly_uint64 common_factor = 1;
5616   for (slp_instance inst : LOOP_VINFO_SLP_INSTANCES (loop_vinfo))
5617     {
5618       /* FORNOW: SLP if you can, so every instance is counted.  */
5619       ++num_slp;
5620
5621       /* All unroll factors have the form:
5622
5623            GET_MODE_SIZE (vinfo->vector_mode) * X
5624
5625          for some rational X, which guarantees a common multiple.  */
5626       common_factor
5627         = force_common_multiple (common_factor,
5628                                  SLP_INSTANCE_UNROLLING_FACTOR (inst));
5629
5630       /* Flag every stmt of INST as PURE_SLP.  vect_detect_hybrid_slp ()
5631          runs later and marks as HYBRID the stmts that need both SLP and
5632          loop-based vectorization.  */
5633       vect_mark_slp_stmts (SLP_INSTANCE_TREE (inst));
5634     }
5635
5636   /* Record the combined factor in LOOP_VINFO.  */
5637   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = common_factor;
5638
5639   /* Report the decision, but only if something was selected.  */
5640   const bool any_slp = num_slp > 0;
5641   if (any_slp && dump_enabled_p ())
5642     {
5643       dump_printf_loc (MSG_NOTE, vect_location,
5644                        "Decided to SLP %d instances. Unrolling factor ",
5645                        num_slp);
5646       dump_dec (MSG_NOTE, common_factor);
5647       dump_printf (MSG_NOTE, "\n");
5648     }
5649
5650   return any_slp;
5651 }
5652
5653 /* Private data for vect_detect_hybrid_slp.  */
5654 struct vdhs_data
5655 {
       /* Loop info used to look up the stmt that defines a walked operand.  */
5656   loop_vec_info loop_vinfo;
       /* Stmts whose operands still have to be walked.  Stmts that get marked
          hybrid are pushed here.  */
5657   vec<stmt_vec_info> *worklist;
5658 };
5659
5660 /* Walker for walk_gimple_op.

        DATA is a walk_stmt_info whose info field points to a vdhs_data.  If
        the operand *TP is defined by a stmt that is currently PURE_SLP, mark
        that stmt hybrid and push it onto the worklist of the vdhs_data.
        Operands on the LHS and operands without a known defining stmt are
        ignored.  Always returns NULL_TREE.  */
5661
5662 static tree
5663 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5664 {
5665   walk_stmt_info *wi = (walk_stmt_info *)data;
5666   vdhs_data *dat = (vdhs_data *)wi->info;
5667
5668   if (wi->is_lhs)
5669     return NULL_TREE;
5670
5671   stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5672   if (!def_stmt_info)
5673     return NULL_TREE;
       /* The SLP type is checked on the result of vect_stmt_to_vectorize
          rather than on the looked-up stmt itself.  */
5674   def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5675   if (PURE_SLP_STMT (def_stmt_info))
5676     {
5677       if (dump_enabled_p ())
5678         dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5679                          def_stmt_info->stmt);
5680       STMT_SLP_TYPE (def_stmt_info) = hybrid;
5681       dat->worklist->safe_push (def_stmt_info);
5682     }
5683
5684   return NULL_TREE;
5685 }
5686
5687 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
5688    if so, otherwise pushing it to WORKLIST.

        The uses examined are those of the defs of the original stmt of
        STMT_INFO; debug stmts are ignored.  STMT_INFO is pushed to WORKLIST
        if that stmt has no def, if one of the uses is in a stmt unknown to
        VINFO, or if one of the uses is in a stmt whose stmt to vectorize has
        a zero STMT_SLP_TYPE.  */
5689
5690 static void
5691 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5692                                vec<stmt_vec_info> &worklist,
5693                                stmt_vec_info stmt_info)
5694 {
5695   if (dump_enabled_p ())
5696     dump_printf_loc (MSG_NOTE, vect_location,
5697                      "Processing hybrid candidate : %G", stmt_info->stmt);
5698   stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5699   imm_use_iterator iter2;
5700   ssa_op_iter iter1;
5701   use_operand_p use_p;
5702   def_operand_p def_p;
5703   bool any_def = false;
5704   FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5705     {
5706       any_def = true;
5707       FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5708         {
5709           if (is_gimple_debug (USE_STMT (use_p)))
5710             continue;
5711           stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5712           /* An out-of loop use means this is a loop_vect sink.  */
5713           if (!use_info)
5714             {
5715               if (dump_enabled_p ())
5716                 dump_printf_loc (MSG_NOTE, vect_location,
5717                                  "Found loop_vect sink: %G", stmt_info->stmt);
5718               worklist.safe_push (stmt_info);
5719               return;
5720             }
               /* A use by a stmt that is not SLPed keeps STMT_INFO
                  loop_vect.  */
5721           else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5722             {
5723               if (dump_enabled_p ())
5724                 dump_printf_loc (MSG_NOTE, vect_location,
5725                                  "Found loop_vect use: %G", use_info->stmt);
5726               worklist.safe_push (stmt_info);
5727               return;
5728             }
5729         }
5730     }
5731   /* No def means this is a loop_vect sink.  */
5732   if (!any_def)
5733     {
5734       if (dump_enabled_p ())
5735         dump_printf_loc (MSG_NOTE, vect_location,
5736                          "Found loop_vect sink: %G", stmt_info->stmt);
5737       worklist.safe_push (stmt_info);
5738       return;
5739     }
       /* Every use that was examined is in a stmt with a nonzero
          STMT_SLP_TYPE.  */
5740   if (dump_enabled_p ())
5741     dump_printf_loc (MSG_NOTE, vect_location,
5742                      "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5743   STMT_SLP_TYPE (stmt_info) = pure_slp;
5744 }
5745
5746 /* Find stmts that must be both vectorized and SLPed.

        First collect the relevant stmts of LOOP_VINFO that have no SLP type
        into a worklist (or mark them pure_slp, see
        maybe_push_to_hybrid_worklist), then walk the operands of every stmt
        on the worklist and mark the pure-SLP stmts that define them as
        hybrid.  */
5747
5748 void
5749 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5750 {
5751   DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5752
5753   /* All stmts participating in SLP are marked pure_slp, all other
5754      stmts are loop_vect.
5755      First collect all loop_vect stmts into a worklist.
5756      SLP patterns cause not all original scalar stmts to appear in
5757      SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5758      Rectify this here and do a backward walk over the IL only considering
5759      stmts as loop_vect when they are used by a loop_vect stmt and otherwise
5760      mark them as pure_slp.  */
5761   auto_vec<stmt_vec_info> worklist;
       /* Visit the blocks in reverse order and, within a block, the PHIs
          first and then the other stmts from last to first.  */
5762   for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5763     {
5764       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5765       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5766            gsi_next (&gsi))
5767         {
5768           gphi *phi = gsi.phi ();
5769           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5770           if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5771             maybe_push_to_hybrid_worklist (loop_vinfo,
5772                                            worklist, stmt_info);
5773         }
5774       for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5775            gsi_prev (&gsi))
5776         {
5777           gimple *stmt = gsi_stmt (gsi);
5778           if (is_gimple_debug (stmt))
5779             continue;
5780           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
5781           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5782             {
                   /* For a stmt that is part of a pattern, consider the stmts
                      of its pattern def sequence and then the related
                      (pattern) stmt instead of the original stmt.  */
5783               for (gimple_stmt_iterator gsi2
5784                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5785                    !gsi_end_p (gsi2); gsi_next (&gsi2))
5786                 {
5787                   stmt_vec_info patt_info
5788                     = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
5789                   if (!STMT_SLP_TYPE (patt_info)
5790                       && STMT_VINFO_RELEVANT (patt_info))
5791                     maybe_push_to_hybrid_worklist (loop_vinfo,
5792                                                    worklist, patt_info);
5793                 }
5794               stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5795             }
5796           if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5797             maybe_push_to_hybrid_worklist (loop_vinfo,
5798                                            worklist, stmt_info);
5799         }
5800     }
5801
5802   /* Now we have a worklist of non-SLP stmts, follow use->def chains and
5803      mark any SLP vectorized stmt as hybrid.
5804      ???  We're visiting def stmts N times (once for each non-SLP and
5805      once for each hybrid-SLP use).  */
5806   walk_stmt_info wi;
5807   vdhs_data dat;
5808   dat.worklist = &worklist;
5809   dat.loop_vinfo = loop_vinfo;
5810   memset (&wi, 0, sizeof (wi));
5811   wi.info = (void *)&dat;
       /* The walker pushes every stmt it marks hybrid, so this loop runs
          until no new hybrid stmt is found.  */
5812   while (!worklist.is_empty ())
5813     {
5814       stmt_vec_info stmt_info = worklist.pop ();
5815       /* Since SSA operands are not set up for pattern stmts we need
5816          to use walk_gimple_op.  */
5817       wi.is_lhs = 0;
5818       walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
5819       /* For gather/scatter make sure to walk the offset operand, that
5820          can be a scaling and conversion away.  */
5821       gather_scatter_info gs_info;
5822       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5823           && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
5824         {
5825           int dummy;
5826           vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
5827         }
5828     }
5829 }
5830
5831
5832 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks.

        SHARED is passed on to the vec_info base class.  Every PHI of all but
        the first block and every non-debug stmt of all blocks is added with
        add_stmt after its UID has been set to 0.  */
5833
5834 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
5835   : vec_info (vec_info::bb, shared),
5836     bbs (_bbs),
5837     roots (vNULL)
5838 {
5839   for (unsigned i = 0; i < bbs.length (); ++i)
5840     {
           /* The PHIs of the first block are skipped.  */
5841       if (i != 0)
5842         for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5843              gsi_next (&si))
5844           {
5845             gphi *phi = si.phi ();
5846             gimple_set_uid (phi, 0);
5847             add_stmt (phi);
5848           }
5849       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5850            !gsi_end_p (gsi); gsi_next (&gsi))
5851         {
5852           gimple *stmt = gsi_stmt (gsi);
               /* Debug stmts get their UID set as well but are not added.  */
5853           gimple_set_uid (stmt, 0);
5854           if (is_gimple_debug (stmt))
5855             continue;
5856           add_stmt (stmt);
5857         }
5858     }
5859 }
5860
5861
5862 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
5863    stmts in the basic block.

        The body itself resets the UID of the stmts that the constructor
        visited to -1 and releases the vectors held by ROOTS.
        NOTE(review): the stmt_vec_info structs are presumably freed by the
        vec_info base class destructor, which is not visible here --
        confirm.  */
5864
5865 _bb_vec_info::~_bb_vec_info ()
5866 {
5867   /* Reset region marker.  */
5868   for (unsigned i = 0; i < bbs.length (); ++i)
5869     {
           /* As in the constructor, the PHIs of the first block are
              skipped.  */
5870       if (i != 0)
5871         for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5872              gsi_next (&si))
5873           {
5874             gphi *phi = si.phi ();
5875             gimple_set_uid (phi, -1);
5876           }
5877       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5878            !gsi_end_p (gsi); gsi_next (&gsi))
5879         {
5880           gimple *stmt = gsi_stmt (gsi);
5881           gimple_set_uid (stmt, -1);
5882         }
5883     }
5884
       /* Release the vectors owned by each root entry before releasing the
          vector of roots itself.  */
5885   for (unsigned i = 0; i < roots.length (); ++i)
5886     {
5887       roots[i].stmts.release ();
5888       roots[i].roots.release ();
5889     }
5890   roots.release ();
5891 }
5892
5893 /* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
5894    given that child nodes have already been processed, and that
5895    their def types currently match their SLP node's def type.

        Set SLP_TREE_NUMBER_OF_VEC_STMTS of NODE and then analyze NODE itself:
        a VEC_PERM_EXPR node is checked with vectorizable_slp_permutation and,
        for its live scalar stmts, vectorizable_live_operation; any other node
        is passed to vect_analyze_stmt using its representative stmt.
        NODE_INSTANCE and COST_VEC are passed through to those checks.
        Return false if one of the checks fails.  */
5896
5897 static bool
5898 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
5899                                     slp_instance node_instance,
5900                                     stmt_vector_for_cost *cost_vec)
5901 {
5902   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5903
5904   /* Calculate the number of vector statements to be created for the
5905      scalar stmts in this node.  For SLP reductions it is equal to the
5906      number of vector statements in the children (which has already been
5907      calculated by the recursive call).  Otherwise it is the number of
5908      scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
5909      VF divided by the number of elements in a vector.  */
5910   if (!STMT_VINFO_DATA_REF (stmt_info)
5911       && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5912     {
           /* Take the count from the first child that is an internal def.  */
5913       for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
5914         if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
5915           {
5916             SLP_TREE_NUMBER_OF_VEC_STMTS (node)
5917               = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
5918             break;
5919           }
5920     }
5921   else
5922     {
           /* A vectorization factor of 1 is used when VINFO is not a
              loop_vec_info.  */
5923       poly_uint64 vf;
5924       if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
5925         vf = loop_vinfo->vectorization_factor;
5926       else
5927         vf = 1;
5928       unsigned int group_size = SLP_TREE_LANES (node);
5929       tree vectype = SLP_TREE_VECTYPE (node);
5930       SLP_TREE_NUMBER_OF_VEC_STMTS (node)
5931         = vect_get_num_vectors (vf * group_size, vectype);
5932     }
5933
5934   /* Handle purely internal nodes.  */
5935   if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5936     {
5937       if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
5938         return false;
5939
           /* Each live scalar stmt of the permutation node must also be
              supported as a live operation for its lane I.  */
5940       stmt_vec_info slp_stmt_info;
5941       unsigned int i;
5942       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
5943         {
5944           if (STMT_VINFO_LIVE_P (slp_stmt_info)
5945               && !vectorizable_live_operation (vinfo,
5946                                                slp_stmt_info, NULL, node,
5947                                                node_instance, i,
5948                                                false, cost_vec))
5949             return false;
5950         }
5951       return true;
5952     }
5953
5954   gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
5955
5956   bool dummy;
5957   return vect_analyze_stmt (vinfo, stmt_info, &dummy,
5958                             node, node_instance, cost_vec);
5959 }
5960
5961 /* Try to build NODE from scalars, returning true on success.
5962    NODE_INSTANCE is the SLP instance that contains NODE.

        This is only done when VINFO is a bb_vec_info, NODE is not the root of
        NODE_INSTANCE, NODE has scalar stmts none of which is a pattern stmt,
        and the vector type of NODE is not a boolean vector type.  On success
        NODE becomes a vect_external_def whose scalar operands are the LHSs of
        the original scalar stmts; its vector type is cleared and its load
        permutation released.  */
5963
5964 static bool
5965 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
5966                               slp_instance node_instance)
5967 {
5968   stmt_vec_info stmt_info;
5969   unsigned int i;
5970
5971   if (!is_a <bb_vec_info> (vinfo)
5972       || node == SLP_INSTANCE_TREE (node_instance)
5973       || !SLP_TREE_SCALAR_STMTS (node).exists ()
5974       || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
5975       /* Force the mask use to be built from scalars instead.  */
5976       || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
5977     return false;
5978
5979   if (dump_enabled_p ())
5980     dump_printf_loc (MSG_NOTE, vect_location,
5981                      "Building vector operands of %p from scalars instead\n",
5982                      (void *) node);
5983
5984   /* Don't remove and free the child nodes here, since they could be
5985      referenced by other structures.  The analysis and scheduling phases
5986      (need to) ignore child nodes of anything that isn't vect_internal_def.  */
5987   unsigned int group_size = SLP_TREE_LANES (node);
5988   SLP_TREE_DEF_TYPE (node) = vect_external_def;
5989   /* Invariants get their vector type from the uses.  */
5990   SLP_TREE_VECTYPE (node) = NULL_TREE;
5991   SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
5992   SLP_TREE_LOAD_PERMUTATION (node).release ();
       /* Lane I of the external node is the value defined by the original
          (non-pattern) stmt of scalar stmt I.  */
5993   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5994     {
5995       tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
5996       SLP_TREE_SCALAR_OPS (node)[i] = lhs;
5997     }
5998   return true;
5999 }
6000
6001 /* Return true if all elements of the slice are the same, that is if
        every element compares equal to the first one under operand_equal_p.
        A slice with fewer than two elements trivially qualifies.  */
6002 bool
6003 vect_scalar_ops_slice::all_same_p () const
6004 {
6005   for (unsigned int i = 1; i < length; ++i)
6006     if (!operand_equal_p (op (0), op (i)))
6007       return false;
6008   return true;
6009 }
6010
     /* Hash the slice S by folding each of its operands, in order, into a
        single value with iterative_hash_expr.  The hash of an empty slice
        is zero.  */
6011 hashval_t
6012 vect_scalar_ops_slice_hash::hash (const value_type &s)
6013 {
6014   hashval_t hash = 0;
6015   for (unsigned i = 0; i < s.length; ++i)
6016     hash = iterative_hash_expr (s.op (i), hash);
6017   return hash;
6018 }
6019
     /* Return true if the slices S1 and S2 have the same length and their
        operands compare equal pairwise, position by position, under
        operand_equal_p.  */
6020 bool
6021 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6022                                    const compare_type &s2)
6023 {
       /* Slices of different length can never be equal.  */
6024   if (s1.length != s2.length)
6025     return false;
6026   for (unsigned i = 0; i < s1.length; ++i)
6027     if (!operand_equal_p (s1.op (i), s2.op (i)))
6028       return false;
6029   return true;
6030 }
6031
6032 /* Compute the prologue cost for invariant or constant operands represented
6033    by NODE.

        One vect_prologue cost entry is recorded in COST_VEC for every vector
        that gets costed: vector_load for a constant node, scalar_to_vec
        when all inspected operands of the vector are the same and
        vec_construct otherwise.  Nothing is recorded for a node that has no
        scalar operands but already has vector defs.  */
6034
6035 static void
6036 vect_prologue_cost_for_slp (slp_tree node,
6037                             stmt_vector_for_cost *cost_vec)
6038 {
6039   /* There's a special case of an existing vector, that costs nothing.  */
6040   if (SLP_TREE_SCALAR_OPS (node).length () == 0
6041       && !SLP_TREE_VEC_DEFS (node).is_empty ())
6042     return;
6043   /* Without looking at the actual initializer a vector of
6044      constants can be implemented as load from the constant pool.
6045      When all elements are the same we can use a splat.  */
6046   tree vectype = SLP_TREE_VECTYPE (node);
6047   unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6048   unsigned HOST_WIDE_INT const_nunits;
       /* Number of scalar operands checked for uniformity per costed vector.  */
6049   unsigned nelt_limit;
6050   auto ops = &SLP_TREE_SCALAR_OPS (node);
       /* Index of the first scalar operand of each vector that gets costed.  */
6051   auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6052   if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6053       && ! multiple_p (const_nunits, group_size))
6054     {
           /* The vectors can differ from each other.  Cost every distinct
              slice of CONST_NUNITS operands just once; a slice equal to an
              earlier one is not pushed to STARTS again.  */
6055       nelt_limit = const_nunits;
6056       hash_set<vect_scalar_ops_slice_hash> vector_ops;
6057       for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6058         if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6059           starts.quick_push (i * const_nunits);
6060     }
6061   else
6062     {
6063       /* If either the vector has variable length or the vectors
6064          are composed of repeated whole groups we only need to
6065          cost construction once.  All vectors will be the same.  */
6066       nelt_limit = group_size;
6067       starts.quick_push (0);
6068     }
6069   /* ???  We're just tracking whether vectors in a single node are the same.
6070      Ideally we'd do something more global.  */
6071   for (unsigned int start : starts)
6072     {
           /* Pick the cost kind for the vector starting at operand START.  */
6073       vect_cost_for_stmt kind;
6074       if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6075         kind = vector_load;
6076       else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6077         kind = scalar_to_vec;
6078       else
6079         kind = vec_construct;
6080       record_stmt_cost (cost_vec, 1, kind, node, vectype, 0, vect_prologue);
6081     }
6082 }
6083
6084 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6085    the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
6086
6087    Return true if the operations are supported.  True is also returned
        when the analysis failed but NODE could be converted to an external
        node by vect_slp_convert_to_external.

        VISITED_SET holds the nodes that were already analyzed and VISITED_VEC
        records, in order, the nodes this walk added to it, so that the
        entries added while analyzing NODE can be removed again on failure.
        Costs are appended to COST_VEC; on failure it is truncated back to
        the length it had before the children of NODE were analyzed.  */
6088
6089 static bool
6090 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6091                                   slp_instance node_instance,
6092                                   hash_set<slp_tree> &visited_set,
6093                                   vec<slp_tree> &visited_vec,
6094                                   stmt_vector_for_cost *cost_vec)
6095 {
6096   int i, j;
6097   slp_tree child;
6098
6099   /* Assume we can code-generate all invariants.  */
6100   if (!node
6101       || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6102       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6103     return true;
6104
6105   if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6106     {
6107       if (dump_enabled_p ())
6108         dump_printf_loc (MSG_NOTE, vect_location,
6109                          "Failed cyclic SLP reference in %p\n", (void *) node);
6110       return false;
6111     }
6112   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6113
6114   /* If we already analyzed the exact same set of scalar stmts we're done.
6115      We share the generated vector stmts for those.  */
6116   if (visited_set.add (node))
6117     return true;
6118   visited_vec.safe_push (node);
6119
6120   bool res = true;
       /* Remember the state to roll back to on failure.  VISITED_REC_START is
          taken after NODE itself was pushed, which is why the rollback loop
          below uses ">=" and thereby pops NODE as well.  */
6121   unsigned visited_rec_start = visited_vec.length ();
6122   unsigned cost_vec_rec_start = cost_vec->length ();
6123   bool seen_non_constant_child = false;
       /* Analyze the children first, stopping at the first failure.  */
6124   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6125     {
6126       res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6127                                               visited_set, visited_vec,
6128                                               cost_vec);
6129       if (!res)
6130         break;
6131       if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6132         seen_non_constant_child = true;
6133     }
6134   /* We're having difficulties scheduling nodes with just constant
6135      operands and no scalar stmts since we then cannot compute a stmt
6136      insertion place.  */
6137   if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6138     {
6139       if (dump_enabled_p ())
6140         dump_printf_loc (MSG_NOTE, vect_location,
6141                          "Cannot vectorize all-constant op node %p\n",
6142                          (void *) node);
6143       res = false;
6144     }
6145
       /* With all children fine, analyze NODE itself.  */
6146   if (res)
6147     res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6148                                               cost_vec);
6149   /* If analysis failed we have to pop all recursive visited nodes
6150      plus ourselves.  */
6151   if (!res)
6152     {
6153       while (visited_vec.length () >= visited_rec_start)
6154         visited_set.remove (visited_vec.pop ());
6155       cost_vec->truncate (cost_vec_rec_start);
6156     }
6157
6158   /* When the node can be vectorized cost invariant nodes it references.
6159      This is not done in DFS order to allow the referring node
6160      vectorizable_* calls to nail down the invariant nodes vector type
6161      and possibly unshare it if it needs a different vector type than
6162      other referrers.  */
6163   if (res)
6164     FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6165       if (child
6166           && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6167               || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6168           /* Perform usual caching, note code-generation still
6169              code-gens these nodes multiple times but we expect
6170              to CSE them later.  */
6171           && !visited_set.add (child))
6172         {
6173           visited_vec.safe_push (child);
6174           /* ???  After auditing more code paths make a "default"
6175              and push the vector type from NODE to all children
6176              if it is not already set.  */
6177           /* Compute the number of vectors to be generated.  */
6178           tree vector_type = SLP_TREE_VECTYPE (child);
6179           if (!vector_type)
6180             {
6181               /* For shifts with a scalar argument we don't need
6182                  to cost or code-generate anything.
6183                  ???  Represent this more explicitly.  */
6184               gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6185                            == shift_vec_info_type)
6186                           && j == 1);
6187               continue;
6188             }
6189           unsigned group_size = SLP_TREE_LANES (child);
               /* The factor is 1 unless VINFO is a loop_vec_info, in which
                  case the loop's vectorization factor is used.  */
6190           poly_uint64 vf = 1;
6191           if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6192             vf = loop_vinfo->vectorization_factor;
6193           SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6194             = vect_get_num_vectors (vf * group_size, vector_type);
6195           /* And cost them.  */
6196           vect_prologue_cost_for_slp (child, cost_vec);
6197         }
6198
6199   /* If this node or any of its children can't be vectorized, try pruning
6200      the tree here rather than felling the whole thing.  */
6201   if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6202     {
6203       /* We'll need to revisit this for invariant costing and number
6204          of vectorized stmt setting.   */
6205       res = true;
6206     }
6207
6208   return res;
6209 }
6210
6211 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6212    region and that can be vectorized using vectorizable_live_operation
6213    with STMT_VINFO_LIVE_P.  Not handled live operations will cause the
6214    scalar code computing it to be retained.

        INSTANCE and COST_VEC are handed on to vectorizable_live_operation.
        SVISITED collects the scalar stmts that are settled and are skipped
        when seen again, VISITED the SLP nodes that were already walked.
        The walk recurses into the vect_internal_def children of NODE.  */
6215
6216 static void
6217 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6218                              slp_instance instance,
6219                              stmt_vector_for_cost *cost_vec,
6220                              hash_set<stmt_vec_info> &svisited,
6221                              hash_set<slp_tree> &visited)
6222 {
6223   if (visited.add (node))
6224     return;
6225
6226   unsigned i;
6227   stmt_vec_info stmt_info;
       /* Used for the dominance check on the lane-extract insertion place
          further down.  */
6228   stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6229   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6230     {
6231       if (svisited.contains (stmt_info))
6232         continue;
6233       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6234       if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6235           && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6236         /* Only the pattern root stmt computes the original scalar value.  */
6237         continue;
           /* Stays true unless the lane turns out to be live and the live
              operation can be vectorized from this node.  */
6238       bool mark_visited = true;
6239       gimple *orig_stmt = orig_stmt_info->stmt;
6240       ssa_op_iter op_iter;
6241       def_operand_p def_p;
6242       FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6243         {
6244           imm_use_iterator use_iter;
6245           gimple *use_stmt;
6246           stmt_vec_info use_stmt_info;
               /* Find the first non-debug use that is either unknown to
                  BB_VINFO or not a pure SLP stmt; that use alone decides
                  (note the break) whether the lane is marked live.  */
6247           FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6248             if (!is_gimple_debug (use_stmt))
6249               {
6250                 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6251                 if (!use_stmt_info
6252                     || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6253                   {
6254                     STMT_VINFO_LIVE_P (stmt_info) = true;
6255                     if (vectorizable_live_operation (bb_vinfo, stmt_info,
6256                                                      NULL, node, instance, i,
6257                                                      false, cost_vec))
6258                       /* ???  So we know we can vectorize the live stmt
6259                          from one SLP node.  If we cannot do so from all
6260                          or none consistently we'd have to record which
6261                          SLP node (and lane) we want to use for the live
6262                          operation.  So make sure we can code-generate
6263                          from all nodes.  */
6264                       mark_visited = false;
6265                     else
6266                       STMT_VINFO_LIVE_P (stmt_info) = false;
6267                     break;
6268                   }
6269               }
6270           /* We have to verify whether we can insert the lane extract
6271              before all uses.  The following is a conservative approximation.
6272              We cannot put this into vectorizable_live_operation because
6273              iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6274              doesn't work.
6275              Note that while the fact that we emit code for loads at the
6276              first load should make this a non-problem leafs we construct
6277              from scalars are vectorized after the last scalar def.
6278              ???  If we'd actually compute the insert location during
6279              analysis we could use sth less conservative than the last
6280              scalar stmt in the node for the dominance check.  */
6281           /* ???  What remains is "live" uses in vector CTORs in the same
6282              SLP graph which is where those uses can end up code-generated
6283              right after their definition instead of close to their original
6284              use.  But that would restrict us to code-generate lane-extracts
6285              from the latest stmt in a node.  So we compensate for this
6286              during code-generation, simply not replacing uses for those
6287              hopefully rare cases.  */
6288           if (STMT_VINFO_LIVE_P (stmt_info))
6289             FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6290               if (!is_gimple_debug (use_stmt)
6291                   && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6292                       || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6293                   && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6294                 {
6295                   if (dump_enabled_p ())
6296                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6297                                      "Cannot determine insertion place for "
6298                                      "lane extract\n");
                       /* Give up on this lane: it is not live-vectorized
                          and the stmt is not considered again.  */
6299                   STMT_VINFO_LIVE_P (stmt_info) = false;
6300                   mark_visited = true;
6301                 }
6302         }
6303       if (mark_visited)
6304         svisited.add (stmt_info);
6305     }
6306
6307   slp_tree child;
6308   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6309     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6310       vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
6311                                    cost_vec, svisited, visited);
6312 }
6313
6314 /* Determine whether we can vectorize the reduction epilogue for INSTANCE.
        Return true if so, in which case the cost of the epilogue is recorded
        in COST_VEC as vect_body costs attributed to the first root stmt of
        INSTANCE.  Nothing is recorded when false is returned.  */
6315
6316 static bool
6317 vectorizable_bb_reduc_epilogue (slp_instance instance,
6318                                 stmt_vector_for_cost *cost_vec)
6319 {
6320   gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6321   enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
       /* A MINUS_EXPR root is looked up as a PLUS_EXPR reduction.  */
6322   if (reduc_code == MINUS_EXPR)
6323     reduc_code = PLUS_EXPR;
6324   internal_fn reduc_fn;
6325   tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
       /* Fail without a vector type, without a directly supported reduction
          internal function for REDUC_CODE, or when the type of the scalar
          result is not trivially convertible from the vector element type.  */
6326   if (!vectype
6327       || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6328       || reduc_fn == IFN_LAST
6329       || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6330       || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6331                                      TREE_TYPE (vectype)))
6332     return false;
6333
6334   /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6335      cost log2 vector operations plus shuffles and one extraction.  */
6336   unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6337   record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6338                     vectype, 0, vect_body);
6339   record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6340                     vectype, 0, vect_body);
6341   record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6342                     vectype, 0, vect_body);
6343   return true;
6344 }
6345
6346 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6347    and recurse to children.  Only vect_internal_def nodes are looked at
        and VISITED makes sure each of them is processed just once.  */
6348
6349 static void
6350 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6351                               hash_set<slp_tree> &visited)
6352 {
6353   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6354       || visited.add (node))
6355     return;
6356
6357   stmt_vec_info stmt;
6358   unsigned i;
       /* Drop the original stmt of every lane from ROOTS.  */
6359   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6360     roots.remove (vect_orig_stmt (stmt));
6361
6362   slp_tree child;
6363   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6364     if (child)
6365       vect_slp_prune_covered_roots (child, roots, visited);
6366 }
6367
6368 /* Analyze statements in SLP instances of VINFO.  Return true if the
6369    operations are supported.

        Instances that fail the analysis, and instances with a root stmt that
        is covered by a lane of some remaining instance, are freed and removed
        from VINFO->slp_instances.  The return value is true as long as at
        least one instance remains afterwards.  For basic-block vectorization
        the live stmts of the remaining instances are computed as well.  */
6370
6371 bool
6372 vect_slp_analyze_operations (vec_info *vinfo)
6373 {
6374   slp_instance instance;
6375   int i;
6376
6377   DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6378
6379   hash_set<slp_tree> visited;
       /* I is only advanced when the instance is kept; a failing instance is
          removed from the vector at index I instead.  */
6380   for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6381     {
6382       auto_vec<slp_tree> visited_vec;
6383       stmt_vector_for_cost cost_vec;
6384       cost_vec.create (2);
6385       if (is_a <bb_vec_info> (vinfo))
6386         vect_location = instance->location ();
6387       if (!vect_slp_analyze_node_operations (vinfo,
6388                                              SLP_INSTANCE_TREE (instance),
6389                                              instance, visited, visited_vec,
6390                                              &cost_vec)
6391           /* CTOR instances require vectorized defs for the SLP tree root.  */
6392           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6393               && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6394                   != vect_internal_def
6395                   /* Make sure we vectorized with the expected type.  */
6396                   || !useless_type_conversion_p
6397                         (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6398                                               (instance->root_stmts[0]->stmt))),
6399                          TREE_TYPE (SLP_TREE_VECTYPE
6400                                             (SLP_INSTANCE_TREE (instance))))))
6401           /* Check we can vectorize the reduction.  */
6402           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6403               && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6404         {
6405           slp_tree node = SLP_INSTANCE_TREE (instance);
               /* STMT_INFO is only used to identify the instance in the dump:
                  the first root stmt if there is one, else the first scalar
                  stmt of the root node.  */
6406           stmt_vec_info stmt_info;
6407           if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6408             stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6409           else
6410             stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6411           if (dump_enabled_p ())
6412             dump_printf_loc (MSG_NOTE, vect_location,
6413                              "removing SLP instance operations starting from: %G",
6414                              stmt_info->stmt);
6415           vect_free_slp_instance (instance);
6416           vinfo->slp_instances.ordered_remove (i);
6417           cost_vec.release ();
               /* Forget the nodes that were marked visited for this
                  instance.  */
6418           while (!visited_vec.is_empty ())
6419             visited.remove (visited_vec.pop ());
6420         }
6421       else
6422         {
6423           i++;
6424           if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6425             {
6426               add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6427               cost_vec.release ();
6428             }
6429           else
6430             /* For BB vectorization remember the SLP graph entry
6431                cost for later.  */
6432             instance->cost_vec = cost_vec;
6433         }
6434     }
6435
6436   /* Now look for SLP instances with a root that are covered by other
6437      instances and remove them.  */
6438   hash_set<stmt_vec_info> roots;
6439   for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6440     if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6441       roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6442   if (!roots.is_empty ())
6443     {
           /* Reuse VISITED for the pruning walk over all instances.  */
6444       visited.empty ();
6445       for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6446         vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6447                                       visited);
           /* A root stmt that is no longer in ROOTS was pruned above, so
              remove its instance.  */
6448       for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6449         if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6450             && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6451           {
6452             stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6453             if (dump_enabled_p ())
6454               dump_printf_loc (MSG_NOTE, vect_location,
6455                                "removing SLP instance operations starting "
6456                                "from: %G", root->stmt);
6457             vect_free_slp_instance (instance);
6458             vinfo->slp_instances.ordered_remove (i);
6459           }
6460         else
6461           ++i;
6462     }
6463
6464   /* Compute vectorizable live stmts.  */
6465   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6466     {
6467       hash_set<stmt_vec_info> svisited;
           /* Note this declaration shadows the VISITED set declared at
              function scope; the live stmt walk starts from an empty set.  */
6468       hash_set<slp_tree> visited;
6469       for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6470         {
6471           vect_location = instance->location ();
6472           vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6473                                        instance, &instance->cost_vec, svisited,
6474                                        visited);
6475         }
6476     }
6477
6478   return !vinfo->slp_instances.is_empty ();
6479 }
6480
6481 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
6482    closing the eventual chain.

        An instance that maps to itself in INSTANCE_LEADER ends the chain and
        is the leader that gets returned.  INSTANCE and every instance on the
        chain must have an entry in INSTANCE_LEADER since the result of the
        lookup is dereferenced unconditionally.  */
6483
6484 static slp_instance
6485 get_ultimate_leader (slp_instance instance,
6486                      hash_map<slp_instance, slp_instance> &instance_leader)
6487 {
6488   auto_vec<slp_instance *, 8> chain;
6489   slp_instance *tem;
       /* Follow the leader links until reaching an instance that is its own
          leader, remembering the map slots visited on the way.  */
6490   while (*(tem = instance_leader.get (instance)) != instance)
6491     {
6492       chain.safe_push (tem);
6493       instance = *tem;
6494     }
       /* Make every remembered slot point directly at the ultimate leader.  */
6495   while (!chain.is_empty ())
6496     *chain.pop () = instance;
6497   return instance;
6498 }
6499
6500 namespace {
6501 /* Subroutine of vect_bb_partition_graph_r.  Map KEY to INSTANCE in
6502    KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6503    for KEY.  Return true if KEY was already in KEY_TO_INSTANCE.
6504
6505    INSTANCE_LEADER is as for get_ultimate_leader.  */
6506
6507 template<typename T>
6508 bool
6509 vect_map_to_instance (slp_instance instance, T key,
6510                       hash_map<T, slp_instance> &key_to_instance,
6511                       hash_map<slp_instance, slp_instance> &instance_leader)
6512 {
6513   bool existed_p;
       /* KEY_INSTANCE refers to the map slot of KEY, so the assignment at
          the end updates the map.  */
6514   slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6515   if (!existed_p)
         /* KEY is new, there is no previous instance to merge with.  */
6516     ;
6517   else if (key_instance != instance)
6518     {
6519       /* If we're running into a previously marked key make us the
6520          leader of the current ultimate leader.  This keeps the
6521          leader chain acyclic and works even when the current instance
6522          connects two previously independent graph parts.  */
6523       slp_instance key_leader
6524         = get_ultimate_leader (key_instance, instance_leader);
6525       if (key_leader != instance)
6526         instance_leader.put (key_leader, instance);
6527     }
       /* In all cases KEY maps to INSTANCE from now on.  */
6528   key_instance = instance;
6529   return existed_p;
6530 }
6531 }
6532
6533 /* Worker of vect_bb_partition_graph, recurse on NODE.

        Map every scalar stmt of NODE in STMT_TO_INSTANCE and NODE itself in
        NODE_TO_INSTANCE to INSTANCE, updating INSTANCE_LEADER through
        vect_map_to_instance.  The recursion stops at a NODE that was mapped
        before and otherwise continues into the vect_internal_def children.
        BB_VINFO is only passed on to the recursive calls.  */
6534
6535 static void
6536 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6537                            slp_instance instance, slp_tree node,
6538                            hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6539                            hash_map<slp_tree, slp_instance> &node_to_instance,
6540                            hash_map<slp_instance, slp_instance> &instance_leader)
6541 {
6542   stmt_vec_info stmt_info;
6543   unsigned i;
6544
6545   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6546     vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6547                           instance_leader);
6548
       /* A node that was already mapped has been walked before.  */
6549   if (vect_map_to_instance (instance, node, node_to_instance,
6550                             instance_leader))
6551     return;
6552
6553   slp_tree child;
6554   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6555     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6556       vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6557                                  node_to_instance, instance_leader);
6558 }
6559
6560 /* Partition the SLP graph into pieces that can be costed independently.
        For every SLP instance of BB_VINFO the ultimate leader is determined
        and the instance is pushed to the subgraph_entries of that leader.  */
6561
6562 static void
6563 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6564 {
6565   DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6566
6567   /* First walk the SLP graph assigning each involved scalar stmt a
6568      corresponding SLP graph entry and upon visiting a previously
6569      marked stmt, make the stmt's leader the current SLP graph entry.  */
6570   hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6571   hash_map<slp_tree, slp_instance> node_to_instance;
6572   hash_map<slp_instance, slp_instance> instance_leader;
6573   slp_instance instance;
6574   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6575     {
           /* Each instance starts out as its own leader.  */
6576       instance_leader.put (instance, instance);
6577       vect_bb_partition_graph_r (bb_vinfo,
6578                                  instance, SLP_INSTANCE_TREE (instance),
6579                                  stmt_to_instance, node_to_instance,
6580                                  instance_leader);
6581     }
6582
6583   /* Then collect entries to each independent subgraph.  */
6584   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6585     {
6586       slp_instance leader = get_ultimate_leader (instance, instance_leader);
6587       leader->subgraph_entries.safe_push (instance);
6588       if (dump_enabled_p ()
6589           && leader != instance)
6590         dump_printf_loc (MSG_NOTE, vect_location,
6591                          "instance %p is leader of %p\n",
6592                          (void *) leader, (void *) instance);
6593     }
6594 }
6595
6596 /* Compute the set of scalar stmts participating in internal and external
6597    nodes.

        Starting at NODE, the scalar stmts of every vect_internal_def node are
        added to VSTMTS and the walk continues into its children.  For any
        other node the stmts VINFO knows as definitions of its scalar operands
        are added to ESTMTS and the walk stops.  VISITED makes sure a node is
        processed just once.  */
6598
6599 static void
6600 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6601                                          hash_set<slp_tree> &visited,
6602                                          hash_set<stmt_vec_info> &vstmts,
6603                                          hash_set<stmt_vec_info> &estmts)
6604 {
6605   int i;
6606   stmt_vec_info stmt_info;
6607   slp_tree child;
6608
6609   if (visited.add (node))
6610     return;
6611
6612   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6613     {
6614       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6615         vstmts.add (stmt_info);
6616
6617       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6618         if (child)
6619           vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6620                                                    vstmts, estmts);
6621     }
6622   else
6623     for (tree def : SLP_TREE_SCALAR_OPS (node))
6624       {
             /* Operands without a defining stmt known to VINFO are ignored.  */
6625         stmt_vec_info def_stmt = vinfo->lookup_def (def);
6626         if (def_stmt)
6627           estmts.add (def_stmt);
6628       }
6629 }
6630
6631
6632 /* Compute the scalar cost of the SLP node NODE and its children
6633    and record it in COST_VEC.  Do not account defs that are marked in
6634    LIFE and update LIFE according to uses of NODE.  */
6635
6636 static void
6637 vect_bb_slp_scalar_cost (vec_info *vinfo,
6638                          slp_tree node, vec<bool, va_heap> *life,
6639                          stmt_vector_for_cost *cost_vec,
6640                          hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6641                          hash_set<slp_tree> &visited)
6642 {
6643   unsigned i;
6644   stmt_vec_info stmt_info;
6645   slp_tree child;
6646
6647   if (visited.add (node))
6648     return;
6649
6650   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6651     {
6652       ssa_op_iter op_iter;
6653       def_operand_p def_p;
6654
6655       if ((*life)[i])
6656         continue;
6657
6658       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6659       gimple *orig_stmt = orig_stmt_info->stmt;
6660
6661       /* If there is a non-vectorized use of the defs then the scalar
6662          stmt is kept live in which case we do not account it or any
6663          required defs in the SLP children in the scalar cost.  This
6664          way we make the vectorization more costly when compared to
6665          the scalar cost.  */
6666       if (!STMT_VINFO_LIVE_P (stmt_info))
6667         {
6668           auto_vec<gimple *, 8> worklist;
6669           hash_set<gimple *> *worklist_visited = NULL;
6670           worklist.quick_push (orig_stmt);
6671           do
6672             {
6673               gimple *work_stmt = worklist.pop ();
6674               FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
6675                 {
6676                   imm_use_iterator use_iter;
6677                   gimple *use_stmt;
6678                   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
6679                                          DEF_FROM_PTR (def_p))
6680                     if (!is_gimple_debug (use_stmt))
6681                       {
6682                         stmt_vec_info use_stmt_info
6683                           = vinfo->lookup_stmt (use_stmt);
6684                         if (!use_stmt_info
6685                             || !vectorized_scalar_stmts.contains (use_stmt_info))
6686                           {
6687                             if (use_stmt_info
6688                                 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
6689                               {
6690                                 /* For stmts participating in patterns we have
6691                                    to check its uses recursively.  */
6692                                 if (!worklist_visited)
6693                                   worklist_visited = new hash_set<gimple *> ();
6694                                 if (!worklist_visited->add (use_stmt))
6695                                   worklist.safe_push (use_stmt);
6696                                 continue;
6697                               }
6698                             (*life)[i] = true;
6699                             goto next_lane;
6700                           }
6701                       }
6702                 }
6703             }
6704           while (!worklist.is_empty ());
6705 next_lane:
6706           if (worklist_visited)
6707             delete worklist_visited;
6708           if ((*life)[i])
6709             continue;
6710         }
6711
6712       /* Count scalar stmts only once.  */
6713       if (gimple_visited_p (orig_stmt))
6714         continue;
6715       gimple_set_visited (orig_stmt, true);
6716
6717       vect_cost_for_stmt kind;
6718       if (STMT_VINFO_DATA_REF (orig_stmt_info))
6719         {
6720           if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
6721             kind = scalar_load;
6722           else
6723             kind = scalar_store;
6724         }
6725       else if (vect_nop_conversion_p (orig_stmt_info))
6726         continue;
6727       /* For single-argument PHIs assume coalescing which means zero cost
6728          for the scalar and the vector PHIs.  This avoids artificially
6729          favoring the vector path (but may pessimize it in some cases).  */
6730       else if (is_a <gphi *> (orig_stmt_info->stmt)
6731                && gimple_phi_num_args
6732                     (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
6733         continue;
6734       else
6735         kind = scalar_stmt;
6736       record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
6737                         SLP_TREE_VECTYPE (node), 0, vect_body);
6738     }
6739
6740   auto_vec<bool, 20> subtree_life;
6741   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6742     {
6743       if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6744         {
6745           /* Do not directly pass LIFE to the recursive call, copy it to
6746              confine changes in the callee to the current child/subtree.  */
6747           if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6748             {
6749               subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
6750               for (unsigned j = 0;
6751                    j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
6752                 {
6753                   auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
6754                   if (perm.first == i)
6755                     subtree_life[perm.second] = (*life)[j];
6756                 }
6757             }
6758           else
6759             {
6760               gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
6761               subtree_life.safe_splice (*life);
6762             }
6763           vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
6764                                    vectorized_scalar_stmts, visited);
6765           subtree_life.truncate (0);
6766         }
6767     }
6768 }
6769
6770 /* Comparator for the loop-index sorted cost vectors.  */
6771
6772 static int
6773 li_cost_vec_cmp (const void *a_, const void *b_)
6774 {
6775   auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
6776   auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
6777   if (a->first < b->first)
6778     return -1;
6779   else if (a->first == b->first)
6780     return 0;
6781   return 1;
6782 }
6783
/* Check if vectorization of the basic block is profitable for the
   subgraph denoted by SLP_INSTANCES.

   The scalar cost of the stmts that go away when vectorizing and the
   vector cost recorded on the instances are split by the loop the costed
   stmts reside in and compared part by part.  The subgraph is considered
   profitable only if no compared part has a vector cost exceeding its
   scalar cost and no vector cost part is left over without a scalar
   counterpart.

   The cost vectors of the instances are consumed (spliced away and
   released) by this function.  The visited flag of the costed scalar
   stmts is cleared again before returning, unless ORIG_LOOP is non-NULL
   and the subgraph is profitable, in which case clearing is delayed.  */

static bool
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
                                    vec<slp_instance> slp_instances,
                                    loop_p orig_loop)
{
  slp_instance instance;
  int i;
  unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
  unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
      hash_set<slp_tree> visited;
      FOR_EACH_VEC_ELT (slp_instances, i, instance)
        vect_print_slp_graph (MSG_NOTE, vect_location,
                              SLP_INSTANCE_TREE (instance), visited);
    }

  /* Compute the set of scalar stmts we know will go away 'locally' when
     vectorizing.  This used to be tracked with just PURE_SLP_STMT but that's
     not accurate for nodes promoted extern late or for scalar stmts that
     are used both in extern defs and in vectorized defs.  */
  hash_set<stmt_vec_info> vectorized_scalar_stmts;
  hash_set<stmt_vec_info> scalar_stmts_in_externs;
  hash_set<slp_tree> visited;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
                                               SLP_INSTANCE_TREE (instance),
                                               visited,
                                               vectorized_scalar_stmts,
                                               scalar_stmts_in_externs);
      /* The instance root stmts are replaced by vectorization as well.  */
      for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
        vectorized_scalar_stmts.add (rstmt);
    }
  /* Scalar stmts used as defs in external nodes need to be preserved, so
     remove them from vectorized_scalar_stmts.  */
  for (stmt_vec_info stmt : scalar_stmts_in_externs)
    vectorized_scalar_stmts.remove (stmt);

  /* Calculate scalar cost and sum the cost for the vector stmts
     previously collected.  */
  stmt_vector_for_cost scalar_costs = vNULL;
  stmt_vector_for_cost vector_costs = vNULL;
  visited.empty ();
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* Start the walk with one cleared (not live) entry per lane of the
         instance root node.  */
      auto_vec<bool, 20> life;
      life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
                              true);
      /* Account one scalar stmt per instance root stmt, attributing the
         whole cost to the first root stmt.  */
      if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
        record_stmt_cost (&scalar_costs,
                          SLP_INSTANCE_ROOT_STMTS (instance).length (),
                          scalar_stmt,
                          SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
      vect_bb_slp_scalar_cost (bb_vinfo,
                               SLP_INSTANCE_TREE (instance),
                               &life, &scalar_costs, vectorized_scalar_stmts,
                               visited);
      /* Take over the vector costs recorded on the instance.  */
      vector_costs.safe_splice (instance->cost_vec);
      instance->cost_vec.release ();
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");

  /* When costing non-loop vectorization we need to consider each covered
     loop independently and make sure vectorization is profitable.  For
     now we assume a loop may be not entered or executed an arbitrary
     number of iterations (???  static information can provide more
     precise info here) which means we can simply cost each containing
     loops stmts separately.  */

  /* First produce cost vectors sorted by loop index.  The vectors are
     allocated with room for all entries up-front so quick_push can be
     used below.  */
  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    li_scalar_costs (scalar_costs.length ());
  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    li_vector_costs (vector_costs.length ());
  stmt_info_for_cost *cost;
  FOR_EACH_VEC_ELT (scalar_costs, i, cost)
    {
      unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_scalar_costs.quick_push (std::make_pair (l, cost));
    }
  /* Use a random used loop as fallback in case the first vector_costs
     entry does not have a stmt_info associated with it.
     NOTE(review): this reads entry zero and thus relies on at least one
     scalar cost having been recorded - confirm that always holds.  */
  unsigned l = li_scalar_costs[0].first;
  FOR_EACH_VEC_ELT (vector_costs, i, cost)
    {
      /* We inherit from the previous COST, invariants, externals and
         extracts immediately follow the cost for the related stmt.  */
      if (cost->stmt_info)
        l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_vector_costs.quick_push (std::make_pair (l, cost));
    }
  li_scalar_costs.qsort (li_cost_vec_cmp);
  li_vector_costs.qsort (li_cost_vec_cmp);

  /* Now cost the portions individually.  SI and VI index the start of the
     next not yet processed loop part in the sorted scalar and vector cost
     vectors respectively.  */
  unsigned vi = 0;
  unsigned si = 0;
  bool profitable = true;
  while (si < li_scalar_costs.length ()
         && vi < li_vector_costs.length ())
    {
      unsigned sl = li_scalar_costs[si].first;
      unsigned vl = li_vector_costs[vi].first;
      if (sl != vl)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Scalar %d and vector %d loop part do not "
                             "match up, skipping scalar part\n", sl, vl);
          /* Skip the scalar part, assuming zero cost on the vector side.  */
          do
            {
              si++;
            }
          while (si < li_scalar_costs.length ()
                 && li_scalar_costs[si].first == sl);
          continue;
        }

      /* Accumulate the scalar cost of all entries belonging to loop SL.  */
      class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
      do
        {
          add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
          si++;
        }
      while (si < li_scalar_costs.length ()
             && li_scalar_costs[si].first == sl);
      /* Only the body cost is kept for the scalar side, the prologue and
         epilogue results are discarded.  */
      unsigned dummy;
      finish_cost (scalar_target_cost_data, nullptr,
                   &dummy, &scalar_cost, &dummy);

      /* Complete the target-specific vector cost calculation.  */
      class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
      do
        {
          add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
          vi++;
        }
      while (vi < li_vector_costs.length ()
             && li_vector_costs[vi].first == vl);
      finish_cost (vect_target_cost_data, scalar_target_cost_data,
                   &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
      delete scalar_target_cost_data;
      delete vect_target_cost_data;

      vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Cost model analysis for part in loop %d:\n", sl);
          dump_printf (MSG_NOTE, "  Vector cost: %d\n",
                       vec_inside_cost + vec_outside_cost);
          dump_printf (MSG_NOTE, "  Scalar cost: %d\n", scalar_cost);
        }

      /* Vectorization is profitable if its cost is not more than the cost
         of the scalar version.  Note that we err on the vector side for
         equal cost because the cost estimate is otherwise quite pessimistic
         (constant uses are free on the scalar side but cost a load on the
         vector side for example).  */
      if (vec_outside_cost + vec_inside_cost > scalar_cost)
        {
          profitable = false;
          break;
        }
    }
  /* Vector cost parts left over at this point have no scalar cost to
     compare against, so consider the subgraph not profitable.  */
  if (profitable && vi < li_vector_costs.length ())
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Excess vector cost for part in loop %d:\n",
                         li_vector_costs[vi].first);
      profitable = false;
    }

  /* Unset visited flag.  This is delayed when the subgraph is profitable
     and we process the loop for remaining unvectorized if-converted code.  */
  if (!orig_loop || !profitable)
    FOR_EACH_VEC_ELT (scalar_costs, i, cost)
      gimple_set_visited  (cost->stmt_info->stmt, false);

  scalar_costs.release ();
  vector_costs.release ();

  return profitable;
}
6979
6980 /* qsort comparator for lane defs.  */
6981
6982 static int
6983 vld_cmp (const void *a_, const void *b_)
6984 {
6985   auto *a = (const std::pair<unsigned, tree> *)a_;
6986   auto *b = (const std::pair<unsigned, tree> *)b_;
6987   return a->first - b->first;
6988 }
6989
6990 /* Return true if USE_STMT is a vector lane insert into VEC and set
6991    *THIS_LANE to the lane number that is set.  */
6992
6993 static bool
6994 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
6995 {
6996   gassign *use_ass = dyn_cast <gassign *> (use_stmt);
6997   if (!use_ass
6998       || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
6999       || (vec
7000           ? gimple_assign_rhs1 (use_ass) != vec
7001           : ((vec = gimple_assign_rhs1 (use_ass)), false))
7002       || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7003                                      TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7004       || !constant_multiple_p
7005             (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7006              tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7007              this_lane))
7008     return false;
7009   return true;
7010 }
7011
/* Find any vectorizable constructors and add them to the grouped_store
   array.

   All assignments in the blocks of BB_VINFO are scanned for three kinds
   of SLP discovery seeds:
     - vector CONSTRUCTORs, which are pushed to the grouped stores array
       of BB_VINFO,
     - chains of BIT_INSERT_EXPRs that together set every lane of a
       vector, which are recorded in BB_VINFO->roots with kind
       slp_inst_kind_ctor,
     - the end of a chain of associatable scalar operations, which is
       recorded in BB_VINFO->roots with kind slp_inst_kind_bb_reduc.  */

static void
vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
{
  for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
         !gsi_end_p (gsi); gsi_next (&gsi))
    {
      /* Only assignments are of interest.  */
      gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
      if (!assign)
        continue;

      tree rhs = gimple_assign_rhs1 (assign);
      enum tree_code code = gimple_assign_rhs_code (assign);
      use_operand_p use_p;
      gimple *use_stmt;
      if (code == CONSTRUCTOR)
        {
          /* Require a non-uniform vector CONSTRUCTOR with as many elements
             as the vector has lanes and whose first element is not itself
             a vector.  */
          if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
              || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
                           CONSTRUCTOR_NELTS (rhs))
              || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
              || uniform_vector_p (rhs))
            continue;

          /* Every element has to be an SSA name whose definition BB_VINFO
             knows about; J stops short of the element count otherwise.  */
          unsigned j;
          tree val;
          FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
              if (TREE_CODE (val) != SSA_NAME
                  || !bb_vinfo->lookup_def (val))
                break;
          if (j != CONSTRUCTOR_NELTS (rhs))
            continue;

          stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
          BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
        }
      else if (code == BIT_INSERT_EXPR
               && VECTOR_TYPE_P (TREE_TYPE (rhs))
               && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
               && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
               && integer_zerop (gimple_assign_rhs3 (assign))
               && useless_type_conversion_p
                    (TREE_TYPE (TREE_TYPE (rhs)),
                     TREE_TYPE (gimple_assign_rhs2 (assign)))
               && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
        {
          /* We start to match on insert to lane zero but since the
             inserts need not be ordered we'd have to search both
             the def and the use chains.  */
          tree vectype = TREE_TYPE (rhs);
          unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
          /* LANE_DEFS collects (lane, inserted value) pairs and LANES
             tracks which lanes were seen already.  At most NLANES pairs
             are pushed, so the up-front allocation makes quick_push
             safe.  */
          auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
          auto_sbitmap lanes (nlanes);
          bitmap_clear (lanes);
          bitmap_set_bit (lanes, 0);
          tree def = gimple_assign_lhs (assign);
          lane_defs.quick_push
                      (std::make_pair (0, gimple_assign_rhs2 (assign)));
          unsigned lanes_found = 1;
          /* Start with the use chains, the last stmt will be the root.  */
          stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
          vec<stmt_vec_info> roots = vNULL;
          roots.safe_push (last);
          do
            {
              /* Follow DEF to its single use and stop at anything that is
                 not a lane insert into DEF of a value known to BB_VINFO
                 or that sets a lane a second time.  */
              use_operand_p use_p;
              gimple *use_stmt;
              if (!single_imm_use (def, &use_p, &use_stmt))
                break;
              unsigned this_lane;
              if (!bb_vinfo->lookup_stmt (use_stmt)
                  || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
                  || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
                break;
              if (bitmap_bit_p (lanes, this_lane))
                break;
              lanes_found++;
              bitmap_set_bit (lanes, this_lane);
              gassign *use_ass = as_a <gassign *> (use_stmt);
              lane_defs.quick_push (std::make_pair
                                     (this_lane, gimple_assign_rhs2 (use_ass)));
              last = bb_vinfo->lookup_stmt (use_ass);
              roots.safe_push (last);
              def = gimple_assign_lhs (use_ass);
            }
          while (lanes_found < nlanes);
          /* Move the last stmt of the use chain to the front of ROOTS.  */
          if (roots.length () > 1)
            std::swap(roots[0], roots[roots.length () - 1]);
          if (lanes_found < nlanes)
            {
              /* Now search the def chain.  */
              def = gimple_assign_rhs1 (assign);
              do
                {
                  /* Walk backwards through the vector operand as long as
                     it is a single-use SSA name defined by another lane
                     insert of a value known to BB_VINFO.  */
                  if (TREE_CODE (def) != SSA_NAME
                      || !has_single_use (def))
                    break;
                  gimple *def_stmt = SSA_NAME_DEF_STMT (def);
                  unsigned this_lane;
                  if (!bb_vinfo->lookup_stmt (def_stmt)
                      || !vect_slp_is_lane_insert (def_stmt,
                                                   NULL_TREE, &this_lane)
                      || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
                    break;
                  if (bitmap_bit_p (lanes, this_lane))
                    break;
                  lanes_found++;
                  bitmap_set_bit (lanes, this_lane);
                  lane_defs.quick_push (std::make_pair
                                          (this_lane,
                                           gimple_assign_rhs2 (def_stmt)));
                  roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
                  def = gimple_assign_rhs1 (def_stmt);
                }
              while (lanes_found < nlanes);
            }
          if (lanes_found == nlanes)
            {
              /* Sort lane_defs after the lane index and register the root.  */
              lane_defs.qsort (vld_cmp);
              vec<stmt_vec_info> stmts;
              stmts.create (nlanes);
              for (unsigned i = 0; i < nlanes; ++i)
                stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
              bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
                                                   stmts, roots));
            }
          else
            /* Not all lanes are set by the chain, drop the candidate.  */
            roots.release ();
        }
      else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
               && (associative_tree_code (code) || code == MINUS_EXPR)
               /* ???  The flag_associative_math and TYPE_OVERFLOW_WRAPS
                  checks pessimize a two-element reduction.  PR54400.
                  ???  In-order reduction could be handled if we only
                  traverse one operand chain in vect_slp_linearize_chain.  */
               && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
                   || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
                       && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
               /* Ops with constants at the tail can be stripped here.  */
               && TREE_CODE (rhs) == SSA_NAME
               && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
               /* Should be the chain end.  That is, the result is not
                  solely used by an assignment with the same operation
                  or, for PLUS_EXPR and MINUS_EXPR, with the respective
                  other one of the two.  */
               && (!single_imm_use (gimple_assign_lhs (assign),
                                    &use_p, &use_stmt)
                   || !is_gimple_assign (use_stmt)
                   || (gimple_assign_rhs_code (use_stmt) != code
                       && ((code != PLUS_EXPR && code != MINUS_EXPR)
                           || (gimple_assign_rhs_code (use_stmt)
                               != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
        {
          /* We start the match at the end of a possible association
             chain.  */
          auto_vec<chain_op_t> chain;
          auto_vec<std::pair<tree_code, gimple *> > worklist;
          auto_vec<gimple *> chain_stmts;
          gimple *code_stmt = NULL, *alt_code_stmt = NULL;
          /* A chain ending in a MINUS_EXPR is handled as PLUS_EXPR chain.  */
          if (code == MINUS_EXPR)
            code = PLUS_EXPR;
          /* Give up when no reduction function is available for CODE.  */
          internal_fn reduc_fn;
          if (!reduction_fn_for_scalar_code (code, &reduc_fn)
              || reduc_fn == IFN_LAST)
            continue;
          vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
                                    /* ??? */
                                    code_stmt, alt_code_stmt, &chain_stmts);
          if (chain.length () > 1)
            {
              /* Sort the chain according to def_type and operation.  */
              chain.sort (dt_sort_cmp, bb_vinfo);
              /* ???  Now we'd want to strip externals and constants
                 but record those to be handled in the epilogue.  */
              /* ???  For now do not allow mixing ops or externs/constants.  */
              bool invalid = false;
              for (unsigned i = 0; i < chain.length (); ++i)
                if (chain[i].dt != vect_internal_def
                    || chain[i].code != code)
                  invalid = true;
              if (!invalid)
                {
                  /* The chain operands become the lanes and the stmts of
                     the chain the roots of the recorded candidate.  */
                  vec<stmt_vec_info> stmts;
                  stmts.create (chain.length ());
                  for (unsigned i = 0; i < chain.length (); ++i)
                    stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
                  vec<stmt_vec_info> roots;
                  roots.create (chain_stmts.length ());
                  for (unsigned i = 0; i < chain_stmts.length (); ++i)
                    roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
                  bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
                                                       stmts, roots));
                }
            }
        }
    }
}
7210
7211 /* Walk the grouped store chains and replace entries with their
7212    pattern variant if any.  */
7213
7214 static void
7215 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7216 {
7217   stmt_vec_info first_element;
7218   unsigned i;
7219
7220   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7221     {
7222       /* We also have CTORs in this array.  */
7223       if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7224         continue;
7225       if (STMT_VINFO_IN_PATTERN_P (first_element))
7226         {
7227           stmt_vec_info orig = first_element;
7228           first_element = STMT_VINFO_RELATED_STMT (first_element);
7229           DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7230           DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7231           DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7232           DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7233           vinfo->grouped_stores[i] = first_element;
7234         }
7235       stmt_vec_info prev = first_element;
7236       while (DR_GROUP_NEXT_ELEMENT (prev))
7237         {
7238           stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7239           if (STMT_VINFO_IN_PATTERN_P (elt))
7240             {
7241               stmt_vec_info orig = elt;
7242               elt = STMT_VINFO_RELATED_STMT (elt);
7243               DR_GROUP_NEXT_ELEMENT (prev) = elt;
7244               DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7245               DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7246             }
7247           DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7248           prev = elt;
7249         }
7250     }
7251 }
7252
/* Check if the region described by BB_VINFO can be vectorized, returning
   true if so.  When returning false, set FATAL to true if the same failure
   would prevent vectorization at other vector sizes, false if it is still
   worth trying other sizes.  N_STMTS is the number of statements in the
   region.  DATAREF_GROUPS is passed on to the data reference access
   analysis.

   SLP instances that fail the alignment or dependence analysis are freed
   and removed from BB_VINFO; the analysis fails if none is left.  */

static bool
vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
                       vec<int> *dataref_groups)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_bb");

  slp_instance instance;
  int i;
  poly_uint64 min_vf = 2;

  /* The first group of checks is independent of the vector size.  */
  fatal = true;

  /* Analyze the data references.  */

  if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unhandled data-ref in basic "
                         "block.\n");
      return false;
    }

  if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
    {
     if (dump_enabled_p ())
       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                        "not vectorized: unhandled data access in "
                        "basic block.\n");
      return false;
    }

  /* Record constructors and other non-store SLP discovery seeds.  */
  vect_slp_check_for_constructors (bb_vinfo);

  /* If there are no grouped stores and no constructors in the region
     there is no need to continue with pattern recog as vect_analyze_slp
     will fail anyway.  */
  if (bb_vinfo->grouped_stores.is_empty ()
      && bb_vinfo->roots.is_empty ())
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: no grouped stores in "
                         "basic block.\n");
      return false;
    }

  /* While the rest of the analysis below depends on it in some way.  */
  fatal = false;

  vect_pattern_recog (bb_vinfo);

  /* Update store groups from pattern processing.  */
  vect_fixup_store_groups_with_patterns (bb_vinfo);

  /* Check the SLP opportunities in the basic block, analyze and build SLP
     trees.  */
  if (!vect_analyze_slp (bb_vinfo, n_stmts))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Failed to SLP the basic block.\n");
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not vectorized: failed to find SLP opportunities "
                           "in basic block.\n");
        }
      return false;
    }

  /* Optimize permutations.  */
  vect_optimize_slp (bb_vinfo);

  /* Gather the loads reachable from the SLP graph entries.  */
  vect_gather_slp_loads (bb_vinfo);

  vect_record_base_alignments (bb_vinfo);

  /* Analyze and verify the alignment of data references and the
     dependence in the SLP instances.  The index is only advanced for
     instances that are kept since removing one moves the following
     instance into its slot.  */
  for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
    {
      vect_location = instance->location ();
      if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
          || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
        {
          slp_tree node = SLP_INSTANCE_TREE (instance);
          stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "removing SLP instance operations starting from: %G",
                             stmt_info->stmt);
          vect_free_slp_instance (instance);
          BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
          continue;
        }

      /* Mark all the statements that we want to vectorize as pure SLP and
         relevant.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
      vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
      unsigned j;
      stmt_vec_info root;
      /* Likewise consider instance root stmts as vectorized.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
        STMT_SLP_TYPE (root) = pure_slp;

      i++;
    }
  /* Nothing to vectorize if no instance survived the checks above.  */
  if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
    return false;

  if (!vect_slp_analyze_operations (bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: bad operation in basic block.\n");
      return false;
    }

  vect_bb_partition_graph (bb_vinfo);

  return true;
}
7384
7385 /* Subroutine of vect_slp_bb.  Try to vectorize the statements for all
7386    basic blocks in BBS, returning true on success.
7387    The region has N_STMTS statements and has the datarefs given by DATAREFS.  */
7388
7389 static bool
7390 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7391                  vec<int> *dataref_groups, unsigned int n_stmts,
7392                  loop_p orig_loop)
7393 {
7394   bb_vec_info bb_vinfo;
7395   auto_vector_modes vector_modes;
7396
7397   /* Autodetect first vector size we try.  */
7398   machine_mode next_vector_mode = VOIDmode;
7399   targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7400   unsigned int mode_i = 0;
7401
7402   vec_info_shared shared;
7403
7404   machine_mode autodetected_vector_mode = VOIDmode;
7405   while (1)
7406     {
7407       bool vectorized = false;
7408       bool fatal = false;
7409       bb_vinfo = new _bb_vec_info (bbs, &shared);
7410
7411       bool first_time_p = shared.datarefs.is_empty ();
7412       BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7413       if (first_time_p)
7414         bb_vinfo->shared->save_datarefs ();
7415       else
7416         bb_vinfo->shared->check_datarefs ();
7417       bb_vinfo->vector_mode = next_vector_mode;
7418
7419       if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7420         {
7421           if (dump_enabled_p ())
7422             {
7423               dump_printf_loc (MSG_NOTE, vect_location,
7424                                "***** Analysis succeeded with vector mode"
7425                                " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7426               dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7427             }
7428
7429           bb_vinfo->shared->check_datarefs ();
7430
7431           auto_vec<slp_instance> profitable_subgraphs;
7432           for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7433             {
7434               if (instance->subgraph_entries.is_empty ())
7435                 continue;
7436
7437               vect_location = instance->location ();
7438               if (!unlimited_cost_model (NULL)
7439                   && !vect_bb_vectorization_profitable_p
7440                         (bb_vinfo, instance->subgraph_entries, orig_loop))
7441                 {
7442                   if (dump_enabled_p ())
7443                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7444                                      "not vectorized: vectorization is not "
7445                                      "profitable.\n");
7446                   continue;
7447                 }
7448
7449               if (!dbg_cnt (vect_slp))
7450                 continue;
7451
7452               profitable_subgraphs.safe_push (instance);
7453             }
7454
7455           /* When we're vectorizing an if-converted loop body make sure
7456              we vectorized all if-converted code.  */
7457           if (!profitable_subgraphs.is_empty ()
7458               && orig_loop)
7459             {
7460               gcc_assert (bb_vinfo->bbs.length () == 1);
7461               for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7462                    !gsi_end_p (gsi); gsi_next (&gsi))
7463                 {
7464                   /* The costing above left us with DCEable vectorized scalar
7465                      stmts having the visited flag set on profitable
7466                      subgraphs.  Do the delayed clearing of the flag here.  */
7467                   if (gimple_visited_p (gsi_stmt (gsi)))
7468                     {
7469                       gimple_set_visited (gsi_stmt (gsi), false);
7470                       continue;
7471                     }
7472                   if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7473                     continue;
7474
7475                   if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7476                     if (gimple_assign_rhs_code (ass) == COND_EXPR)
7477                       {
7478                         if (!profitable_subgraphs.is_empty ()
7479                             && dump_enabled_p ())
7480                           dump_printf_loc (MSG_NOTE, vect_location,
7481                                            "not profitable because of "
7482                                            "unprofitable if-converted scalar "
7483                                            "code\n");
7484                         profitable_subgraphs.truncate (0);
7485                       }
7486                 }
7487             }
7488
7489           /* Finally schedule the profitable subgraphs.  */
7490           for (slp_instance instance : profitable_subgraphs)
7491             {
7492               if (!vectorized && dump_enabled_p ())
7493                 dump_printf_loc (MSG_NOTE, vect_location,
7494                                  "Basic block will be vectorized "
7495                                  "using SLP\n");
7496               vectorized = true;
7497
7498               vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7499
7500               unsigned HOST_WIDE_INT bytes;
7501               if (dump_enabled_p ())
7502                 {
7503                   if (GET_MODE_SIZE
7504                         (bb_vinfo->vector_mode).is_constant (&bytes))
7505                     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
7506                                      "basic block part vectorized using %wu "
7507                                      "byte vectors\n", bytes);
7508                   else
7509                     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
7510                                      "basic block part vectorized using "
7511                                      "variable length vectors\n");
7512                 }
7513             }
7514         }
7515       else
7516         {
7517           if (dump_enabled_p ())
7518             dump_printf_loc (MSG_NOTE, vect_location,
7519                              "***** Analysis failed with vector mode %s\n",
7520                              GET_MODE_NAME (bb_vinfo->vector_mode));
7521         }
7522
7523       if (mode_i == 0)
7524         autodetected_vector_mode = bb_vinfo->vector_mode;
7525
7526       if (!fatal)
7527         while (mode_i < vector_modes.length ()
7528                && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7529           {
7530             if (dump_enabled_p ())
7531               dump_printf_loc (MSG_NOTE, vect_location,
7532                                "***** The result for vector mode %s would"
7533                                " be the same\n",
7534                                GET_MODE_NAME (vector_modes[mode_i]));
7535             mode_i += 1;
7536           }
7537
7538       delete bb_vinfo;
7539
7540       if (mode_i < vector_modes.length ()
7541           && VECTOR_MODE_P (autodetected_vector_mode)
7542           && (related_vector_mode (vector_modes[mode_i],
7543                                    GET_MODE_INNER (autodetected_vector_mode))
7544               == autodetected_vector_mode)
7545           && (related_vector_mode (autodetected_vector_mode,
7546                                    GET_MODE_INNER (vector_modes[mode_i]))
7547               == vector_modes[mode_i]))
7548         {
7549           if (dump_enabled_p ())
7550             dump_printf_loc (MSG_NOTE, vect_location,
7551                              "***** Skipping vector mode %s, which would"
7552                              " repeat the analysis for %s\n",
7553                              GET_MODE_NAME (vector_modes[mode_i]),
7554                              GET_MODE_NAME (autodetected_vector_mode));
7555           mode_i += 1;
7556         }
7557
7558       if (vectorized
7559           || mode_i == vector_modes.length ()
7560           || autodetected_vector_mode == VOIDmode
7561           /* If vect_slp_analyze_bb_1 signaled that analysis for all
7562              vector sizes will fail do not bother iterating.  */
7563           || fatal)
7564         return vectorized;
7565
7566       /* Try the next biggest vector size.  */
7567       next_vector_mode = vector_modes[mode_i++];
7568       if (dump_enabled_p ())
7569         dump_printf_loc (MSG_NOTE, vect_location,
7570                          "***** Re-trying analysis with vector mode %s\n",
7571                          GET_MODE_NAME (next_vector_mode));
7572     }
7573 }
7574
7575
7576 /* Gather the data references of the non-debug stmts in BBS and pass the
7577    region on to vect_slp_region for ORIG_LOOP; returns that call's result.  */
7578
7579 static bool
7580 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7581 {
7582   int n_stmts = 0;
7583   int group_id = 0;
7584   auto_vec<int> dr_groups;
7585   vec<data_reference_p> drs = vNULL;
7586
7587   for (basic_block bb : bbs)
7588     {
7589       gimple_stmt_iterator gsi = gsi_after_labels (bb);
7590       for (; !gsi_end_p (gsi); gsi_next (&gsi))
7591         {
7592           gimple *stmt = gsi_stmt (gsi);
7593           if (is_gimple_debug (stmt))
7594             continue;
7595
7596           /* Debug stmts were skipped above and are thus not counted.  */
7597           n_stmts += 1;
7598           if (gimple_location (stmt) != UNKNOWN_LOCATION)
7599             vect_location = stmt;
7600
7601           /* Move on to a new DR group whenever the lookup fails.  */
7602           if (!vect_find_stmt_data_reference (NULL, stmt, &drs, &dr_groups,
7603                                               group_id))
7604             group_id += 1;
7605         }
7606
7607       /* Every basic block opens a DR group of its own.  */
7608       group_id += 1;
7609     }
7610   return vect_slp_region (bbs, drs, &dr_groups, n_stmts, orig_loop);
7611 }
7612
7613 /* Special entry for the BB vectorizer.  Analyze and transform a single
7614    if-converted BB with ORIG_LOOP's body being the not if-converted
7615    representation.  Returns true if anything in the basic block was
7616    vectorized, as reported by vect_slp_bbs.  */
7617
7618 bool
7619 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
7620 {
7621   auto_vec<basic_block> bbs;
7622   bbs.safe_push (bb);  /* The region consists of BB alone.  */
7623   return vect_slp_bbs (bbs, orig_loop);
7624 }
7625
7626 /* Main entry for the BB vectorizer.  Split the basic blocks of FUN into
7627    regions, run vect_slp_bbs on each, and return true if any was vectorized.  */
7628
7629 bool
7630 vect_slp_function (function *fun)
7631 {
7632   bool r = false;
7633   int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));  /* Freed below.  */
7634   unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
7635
7636   /* For the moment split the function into pieces to avoid making
7637      the iteration on the vector mode moot.  Split at points we know
7638      to not handle well which is CFG merges (SLP discovery doesn't
7639      handle non-loop-header PHIs) and loop exits.  Since pattern
7640      recog requires reverse iteration to visit uses before defs
7641      simply chop RPO into pieces.  */
7642   auto_vec<basic_block> bbs;
7643   for (unsigned i = 0; i < n; i++)
7644     {
7645       basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
7646       bool split = false;
7647
7648       /* Split when a BB is not dominated by the first block.  */
7649       if (!bbs.is_empty ()
7650           && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
7651         {
7652           if (dump_enabled_p ())
7653             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7654                              "splitting region at dominance boundary bb%d\n",
7655                              bb->index);
7656           split = true;
7657         }
7658       /* Split when the loop determined by the first block
7659          is exited.  This is because we eventually insert
7660          invariants at region begin.  */
7661       else if (!bbs.is_empty ()
7662                && bbs[0]->loop_father != bb->loop_father
7663                && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
7664         {
7665           if (dump_enabled_p ())
7666             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7667                              "splitting region at loop %d exit at bb%d\n",
7668                              bbs[0]->loop_father->num, bb->index);
7669           split = true;
7670         }
7671
7672       if (split && !bbs.is_empty ())
7673         {
7674           r |= vect_slp_bbs (bbs, NULL);
7675           bbs.truncate (0);
7676           bbs.quick_push (bb);  /* BB becomes the first block of the next region.  */
7677         }
7678       else
7679         bbs.safe_push (bb);
7680
7681       /* When we have a stmt ending this block and defining a
7682          value we have to insert on edges when inserting after it for
7683          a vector containing its definition.  Avoid this for now.  */
7684       if (gimple *last = last_stmt (bb))
7685         if (gimple_get_lhs (last)
7686             && is_ctrl_altering_stmt (last))
7687           {
7688             if (dump_enabled_p ())
7689               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7690                                "splitting region at control altering "
7691                                "definition %G", last);
7692             r |= vect_slp_bbs (bbs, NULL);  /* The region handed off still includes BB.  */
7693             bbs.truncate (0);
7694           }
7695     }
7696
7697   if (!bbs.is_empty ())  /* Process whatever region is still pending.  */
7698     r |= vect_slp_bbs (bbs, NULL);
7699
7700   free (rpo);
7701
7702   return r;
7703 }
7704
7705 /* Build a variable-length vector in which the elements in ELTS are repeated
7706    to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
7707    RESULTS and add any new instructions to SEQ.
7708
7709    The approach we use is:
7710
7711    (1) Find a vector mode VM with integer elements of mode IM.
7712
7713    (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7714        ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
7715        from small vectors to IM.
7716
7717    (3) Duplicate each ELTS'[I] into a vector of mode VM.
7718
7719    (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
7720        correct byte contents.
7721
7722    (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
7723
7724    We try to find the largest IM for which this sequence works, in order
7725    to cut down on the number of interleaves.  */
7726
7727 void
7728 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
7729                           const vec<tree> &elts, unsigned int nresults,
7730                           vec<tree> &results)
7731 {
7732   unsigned int nelts = elts.length ();
7733   tree element_type = TREE_TYPE (vector_type);
7734
7735   /* (1) Find a vector mode VM with integer elements of mode IM.  */
7736   unsigned int nvectors = 1;
7737   tree new_vector_type;
7738   tree permutes[2];
7739   if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
7740                                        &nvectors, &new_vector_type,
7741                                        permutes))
7742     gcc_unreachable ();  /* The query is expected to succeed here.  */
7743
7744   /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
7745   unsigned int partial_nelts = nelts / nvectors;
7746   tree partial_vector_type = build_vector_type (element_type, partial_nelts);
7747
7748   tree_vector_builder partial_elts;
7749   auto_vec<tree, 32> pieces (nvectors * 2);
7750   pieces.quick_grow_cleared (nvectors * 2);  /* Two halves of NVECTORS slots.  */
7751   for (unsigned int i = 0; i < nvectors; ++i)
7752     {
7753       /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7754              ELTS' has mode IM.  */
7755       partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
7756       for (unsigned int j = 0; j < partial_nelts; ++j)
7757         partial_elts.quick_push (elts[i * partial_nelts + j]);
7758       tree t = gimple_build_vector (seq, &partial_elts);
7759       t = gimple_build (seq, VIEW_CONVERT_EXPR,
7760                         TREE_TYPE (new_vector_type), t);
7761
7762       /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
7763       pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
7764     }
7765
7766   /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
7767          correct byte contents.
7768
7769      Conceptually, we need to repeat the following operation log2(nvectors)
7770      times, where hi_start = nvectors / 2:
7771
7772         out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
7773         out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
7774
7775      However, if each input repeats every N elements and the VF is
7776      a multiple of N * 2, the HI result is the same as the LO result.
7777      This will be true for the first N1 iterations of the outer loop,
7778      followed by N2 iterations for which both the LO and HI results
7779      are needed.  I.e.:
7780
7781         N1 + N2 = log2(nvectors)
7782
7783      Each "N1 iteration" doubles the number of redundant vectors and the
7784      effect of the process as a whole is to have a sequence of nvectors/2**N1
7785      vectors that repeats 2**N1 times.  Rather than generate these redundant
7786      vectors, we halve the number of vectors for each N1 iteration.  */
7787   unsigned int in_start = 0;  /* Inputs start in the first half of PIECES.  */
7788   unsigned int out_start = nvectors;  /* Outputs go to the other half.  */
7789   unsigned int new_nvectors = nvectors;
7790   for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
7791     {
7792       unsigned int hi_start = new_nvectors / 2;
7793       unsigned int out_i = 0;
7794       for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
7795         {
7796           if ((in_i & 1) != 0
7797               && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
7798                              2 * in_repeat))
7799             continue;  /* The HI result would duplicate the LO result.  */
7800
7801           tree output = make_ssa_name (new_vector_type);
7802           tree input1 = pieces[in_start + (in_i / 2)];
7803           tree input2 = pieces[in_start + (in_i / 2) + hi_start];
7804           gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
7805                                                input1, input2,
7806                                                permutes[in_i & 1]);
7807           gimple_seq_add_stmt (seq, stmt);
7808           pieces[out_start + out_i] = output;
7809           out_i += 1;
7810         }
7811       std::swap (in_start, out_start);  /* This round's outputs feed the next.  */
7812       new_nvectors = out_i;
7813     }
7814
7815   /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.  */
7816   results.reserve (nresults);
7817   for (unsigned int i = 0; i < nresults; ++i)
7818     if (i < new_nvectors)
7819       results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
7820                                         pieces[in_start + i]));
7821     else
7822       results.quick_push (results[i - new_nvectors]);  /* Repeat the sequence.  */
7823 }
7824
7825
7826 /* For constant and loop invariant defs in OP_NODE this function creates
7827    vector defs that will be used in the vectorized stmts and stores them
7828    to SLP_TREE_VEC_DEFS of OP_NODE.  VINFO is used to look up in-region defs.  */
7829
7830 static void
7831 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
7832 {
7833   unsigned HOST_WIDE_INT nunits;
7834   tree vec_cst;
7835   unsigned j, number_of_places_left_in_vector;
7836   tree vector_type;
7837   tree vop;
7838   int group_size = op_node->ops.length ();
7839   unsigned int vec_num, i;
7840   unsigned number_of_copies = 1;
7841   bool constant_p;
7842   gimple_seq ctor_seq = NULL;
7843   auto_vec<tree, 16> permute_results;
7844
7845   /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
7846   vector_type = SLP_TREE_VECTYPE (op_node);
7847
7848   unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
7849   SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
7850   auto_vec<tree> voprnds (number_of_vectors);
7851
7852   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
7853      created vectors. It is greater than 1 if unrolling is performed.
7854
7855      For example, we have two scalar operands, s1 and s2 (e.g., group of
7856      strided accesses of size two), while NUNITS is four (i.e., four scalars
7857      of this type can be packed in a vector).  The output vector will contain
7858      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
7859      will be 2).
7860
7861      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
7862      containing the operands.
7863
7864      For example, NUNITS is four as before, and the group size is 8
7865      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
7866      {s5, s6, s7, s8}.  */
7867
7868   /* When using duplicate_and_interleave, we just need one element for
7869      each scalar statement.  */
7870   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
7871     nunits = group_size;
7872
7873   number_of_copies = nunits * number_of_vectors / group_size;
7874
7875   number_of_places_left_in_vector = nunits;
7876   constant_p = true;
7877   tree_vector_builder elts (vector_type, nunits, 1);
7878   elts.quick_grow (nunits);
7879   stmt_vec_info insert_after = NULL;
7880   for (j = 0; j < number_of_copies; j++)
7881     {
7882       tree op;
7883       for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
7884         {
7885           /* Create 'vect_ = {op0,op1,...,opn}', filling lanes back to front.  */
7886           number_of_places_left_in_vector--;
7887           tree orig_op = op;
7888           if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
7889             {
7890               if (CONSTANT_CLASS_P (op))
7891                 {
7892                   if (VECTOR_BOOLEAN_TYPE_P (vector_type))
7893                     {
7894                       /* Can't use VIEW_CONVERT_EXPR for booleans because
7895                          of possibly different sizes of scalar value and
7896                          vector element.  */
7897                       if (integer_zerop (op))
7898                         op = build_int_cst (TREE_TYPE (vector_type), 0);
7899                       else if (integer_onep (op))
7900                         op = build_all_ones_cst (TREE_TYPE (vector_type));
7901                       else
7902                         gcc_unreachable ();
7903                     }
7904                   else
7905                     op = fold_unary (VIEW_CONVERT_EXPR,
7906                                      TREE_TYPE (vector_type), op);
7907                   gcc_assert (op && CONSTANT_CLASS_P (op));
7908                 }
7909               else
7910                 {
7911                   tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
7912                   gimple *init_stmt;
7913                   if (VECTOR_BOOLEAN_TYPE_P (vector_type))
7914                     {
7915                       tree true_val
7916                         = build_all_ones_cst (TREE_TYPE (vector_type));
7917                       tree false_val
7918                         = build_zero_cst (TREE_TYPE (vector_type));
7919                       gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
7920                       init_stmt = gimple_build_assign (new_temp, COND_EXPR,
7921                                                        op, true_val,
7922                                                        false_val);
7923                     }
7924                   else
7925                     {
7926                       op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
7927                                    op);
7928                       init_stmt
7929                         = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
7930                                                op);
7931                     }
7932                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
7933                   op = new_temp;
7934                 }
7935             }
7936           elts[number_of_places_left_in_vector] = op;
7937           if (!CONSTANT_CLASS_P (op))
7938             constant_p = false;
7939           /* For BB vectorization we have to compute an insert location
7940              when a def is inside the analyzed region since we cannot
7941              simply insert at the BB start in this case.  */
7942           stmt_vec_info opdef;
7943           if (TREE_CODE (orig_op) == SSA_NAME
7944               && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
7945               && is_a <bb_vec_info> (vinfo)
7946               && (opdef = vinfo->lookup_def (orig_op)))
7947             {
7948               if (!insert_after)
7949                 insert_after = opdef;
7950               else
7951                 insert_after = get_later_stmt (insert_after, opdef);
7952             }
7953
7954           if (number_of_places_left_in_vector == 0)
7955             {
7956               if (constant_p
7957                   ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
7958                   : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
7959                 vec_cst = gimple_build_vector (&ctor_seq, &elts);
7960               else
7961                 {
7962                   if (permute_results.is_empty ())
7963                     duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
7964                                               elts, number_of_vectors,
7965                                               permute_results);
7966                   vec_cst = permute_results[number_of_vectors - j - 1];
7967                 }
7968               if (!gimple_seq_empty_p (ctor_seq))
7969                 {
7970                   if (insert_after)
7971                     {
7972                       gimple_stmt_iterator gsi;
7973                       if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
7974                         {
7975                           gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
7976                           gsi_insert_seq_before (&gsi, ctor_seq,
7977                                                  GSI_CONTINUE_LINKING);
7978                         }
7979                       else if (!stmt_ends_bb_p (insert_after->stmt))
7980                         {
7981                           gsi = gsi_for_stmt (insert_after->stmt);
7982                           gsi_insert_seq_after (&gsi, ctor_seq,
7983                                                 GSI_CONTINUE_LINKING);
7984                         }
7985                       else
7986                         {
7987                           /* When we want to insert after a def where the
7988                              defining stmt throws then insert on the fallthru
7989                              edge.  */
7990                           edge e = find_fallthru_edge
7991                                      (gimple_bb (insert_after->stmt)->succs);
7992                           basic_block new_bb
7993                             = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
7994                           gcc_assert (!new_bb);
7995                         }
7996                     }
7997                   else
7998                     vinfo->insert_seq_on_entry (NULL, ctor_seq);
7999                   ctor_seq = NULL;
8000                 }
8001               voprnds.quick_push (vec_cst);
8002               insert_after = NULL;  /* Reset the per-vector state from here on.  */
8003               number_of_places_left_in_vector = nunits;
8004               constant_p = true;
8005               elts.new_vector (vector_type, nunits, 1);
8006               elts.quick_grow (nunits);
8007             }
8008         }
8009     }
8010
8011   /* Since the vectors are created in the reverse order, we should invert
8012      them.  */
8013   vec_num = voprnds.length ();
8014   for (j = vec_num; j != 0; j--)
8015     {
8016       vop = voprnds[j - 1];
8017       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8018     }
8019
8020   /* In case that VF is greater than the unrolling factor needed for the SLP
8021      group of stmts, NUMBER_OF_VECTORS to be created is greater than
8022      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8023      to replicate the vectors.  */
8024   while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8025     for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8026          i++)
8027       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8028 }
8029
8030 /* Return the Ith vectorized definition of SLP_NODE.  */
8031
8032 tree
8033 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8034 {
8035   /* Without vector stmts the defs are read from SLP_TREE_VEC_DEFS.  */
8036   if (!SLP_TREE_VEC_STMTS (slp_node).exists ())
8037     return SLP_TREE_VEC_DEFS (slp_node)[i];
8038   return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
8039 }
8040
8041 /* Fill *VEC_DEFS with the vectorized definitions of SLP_NODE.  */
8042
8043 void
8044 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8045 {
8046   vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8047   if (SLP_TREE_DEF_TYPE (slp_node) != vect_internal_def)
8048     {
8049       /* Other nodes have their defs in SLP_TREE_VEC_DEFS; copy those.  */
8050       vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8051       return;
8052     }
8053   /* For internal defs take the LHS of each vector stmt, in order.  */
8054   for (gimple *vec_stmt : SLP_TREE_VEC_STMTS (slp_node))
8055     vec_defs->quick_push (gimple_get_lhs (vec_stmt));
8056 }
8057
8058 /* Get the vectorized definitions of the first N children of SLP_NODE.  */
8059
8060 void
8061 vect_get_slp_defs (vec_info *,
8062                    slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8063 {
8064   /* N == -1U asks for the defs of every child.  */
8065   const unsigned n_children
8066     = n == -1U ? SLP_TREE_CHILDREN (slp_node).length () : n;
8067   for (unsigned idx = 0; idx < n_children; ++idx)
8068     {
8069       /* Each child contributes one vector of defs to *VEC_OPRNDS.  */
8070       vec<tree> child_defs = vNULL;
8071       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[idx], &child_defs);
8072       vec_oprnds->quick_push (child_defs);
8073     }
8074 }
8075
8076 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8077    - PERM gives the permutation that the caller wants to use for NODE,
8078      which might be different from SLP_LOAD_PERMUTATION.
8079    - DUMP_P controls whether the function dumps information.  */
8080
8081 static bool
8082 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8083                                 load_permutation_t &perm,
8084                                 const vec<tree> &dr_chain,
8085                                 gimple_stmt_iterator *gsi, poly_uint64 vf,
8086                                 bool analyze_only, bool dump_p,
8087                                 unsigned *n_perms, unsigned int *n_loads,
8088                                 bool dce_chain)
8089 {
8090   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8091   int vec_index = 0;
8092   tree vectype = SLP_TREE_VECTYPE (node);
8093   unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8094   unsigned int mask_element;
8095   machine_mode mode;
8096
8097   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8098     return false;
8099
8100   stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8101
8102   mode = TYPE_MODE (vectype);
8103   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8104
8105   /* Initialize the vect stmts of NODE to properly insert the generated
8106      stmts later.  */
8107   if (! analyze_only)
8108     for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
8109          i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
8110       SLP_TREE_VEC_STMTS (node).quick_push (NULL);
8111
8112   /* Generate permutation masks for every NODE. Number of masks for each NODE
8113      is equal to GROUP_SIZE.
8114      E.g., we have a group of three nodes with three loads from the same
8115      location in each node, and the vector size is 4. I.e., we have a
8116      a0b0c0a1b1c1... sequence and we need to create the following vectors:
8117      for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8118      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8119      ...
8120
8121      The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8122      The last mask is illegal since we assume two operands for permute
8123      operation, and the mask element values can't be outside that range.
8124      Hence, the last mask must be converted into {2,5,5,5}.
8125      For the first two permutations we need the first and the second input
8126      vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8127      we need the second and the third vectors: {b1,c1,a2,b2} and
8128      {c2,a3,b3,c3}.  */
8129
8130   int vect_stmts_counter = 0;
8131   unsigned int index = 0;
8132   int first_vec_index = -1;
8133   int second_vec_index = -1;
8134   bool noop_p = true;
8135   *n_perms = 0;
8136
8137   vec_perm_builder mask;
8138   unsigned int nelts_to_build;
8139   unsigned int nvectors_per_build;
8140   unsigned int in_nlanes;
8141   bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
8142                       && multiple_p (nunits, group_size));
8143   if (repeating_p)
8144     {
8145       /* A single vector contains a whole number of copies of the node, so:
8146          (a) all permutes can use the same mask; and
8147          (b) the permutes only need a single vector input.  */
8148       mask.new_vector (nunits, group_size, 3);
8149       nelts_to_build = mask.encoded_nelts ();
8150       nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
8151       in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
8152     }
8153   else
8154     {
8155       /* We need to construct a separate mask for each vector statement.  */
8156       unsigned HOST_WIDE_INT const_nunits, const_vf;
8157       if (!nunits.is_constant (&const_nunits)
8158           || !vf.is_constant (&const_vf))
8159         return false;
8160       mask.new_vector (const_nunits, const_nunits, 1);
8161       nelts_to_build = const_vf * group_size;
8162       nvectors_per_build = 1;
8163       in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
8164     }
8165   auto_sbitmap used_in_lanes (in_nlanes);
8166   bitmap_clear (used_in_lanes);
8167   auto_bitmap used_defs;
8168
8169   unsigned int count = mask.encoded_nelts ();
8170   mask.quick_grow (count);
8171   vec_perm_indices indices;
8172
8173   for (unsigned int j = 0; j < nelts_to_build; j++)
8174     {
8175       unsigned int iter_num = j / group_size;
8176       unsigned int stmt_num = j % group_size;
8177       unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info) + perm[stmt_num]);
8178       bitmap_set_bit (used_in_lanes, i);
8179       if (repeating_p)
8180         {
8181           first_vec_index = 0;
8182           mask_element = i;
8183         }
8184       else
8185         {
8186           /* Enforced before the loop when !repeating_p.  */
8187           unsigned int const_nunits = nunits.to_constant ();
8188           vec_index = i / const_nunits;
8189           mask_element = i % const_nunits;
8190           if (vec_index == first_vec_index
8191               || first_vec_index == -1)
8192             {
8193               first_vec_index = vec_index;
8194             }
8195           else if (vec_index == second_vec_index
8196                    || second_vec_index == -1)
8197             {
8198               second_vec_index = vec_index;
8199               mask_element += const_nunits;
8200             }
8201           else
8202             {
8203               if (dump_p)
8204                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8205                                  "permutation requires at "
8206                                  "least three vectors %G",
8207                                  stmt_info->stmt);
8208               gcc_assert (analyze_only);
8209               return false;
8210             }
8211
8212           gcc_assert (mask_element < 2 * const_nunits);
8213         }
8214
8215       if (mask_element != index)
8216         noop_p = false;
8217       mask[index++] = mask_element;
8218
8219       if (index == count && !noop_p)
8220         {
8221           indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8222           if (!can_vec_perm_const_p (mode, mode, indices))
8223             {
8224               if (dump_p)
8225                 {
8226                   dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8227                                    vect_location,
8228                                    "unsupported vect permute { ");
8229                   for (i = 0; i < count; ++i)
8230                     {
8231                       dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8232                       dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8233                     }
8234                   dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8235                 }
8236               gcc_assert (analyze_only);
8237               return false;
8238             }
8239
8240           ++*n_perms;
8241         }
8242
8243       if (index == count)
8244         {
8245           if (!analyze_only)
8246             {
8247               tree mask_vec = NULL_TREE;
8248
8249               if (! noop_p)
8250                 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8251
8252               if (second_vec_index == -1)
8253                 second_vec_index = first_vec_index;
8254
8255               for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8256                 {
8257                   /* Generate the permute statement if necessary.  */
8258                   tree first_vec = dr_chain[first_vec_index + ri];
8259                   tree second_vec = dr_chain[second_vec_index + ri];
8260                   gimple *perm_stmt;
8261                   if (! noop_p)
8262                     {
8263                       gassign *stmt = as_a <gassign *> (stmt_info->stmt);
8264                       tree perm_dest
8265                         = vect_create_destination_var (gimple_assign_lhs (stmt),
8266                                                        vectype);
8267                       perm_dest = make_ssa_name (perm_dest);
8268                       perm_stmt
8269                         = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8270                                                first_vec, second_vec,
8271                                                mask_vec);
8272                       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8273                                                    gsi);
8274                       if (dce_chain)
8275                         {
8276                           bitmap_set_bit (used_defs, first_vec_index + ri);
8277                           bitmap_set_bit (used_defs, second_vec_index + ri);
8278                         }
8279                     }
8280                   else
8281                     {
8282                       /* If mask was NULL_TREE generate the requested
8283                          identity transform.  */
8284                       perm_stmt = SSA_NAME_DEF_STMT (first_vec);
8285                       if (dce_chain)
8286                         bitmap_set_bit (used_defs, first_vec_index + ri);
8287                     }
8288
8289                   /* Store the vector statement in NODE.  */
8290                   SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
8291                 }
8292             }
8293
8294           index = 0;
8295           first_vec_index = -1;
8296           second_vec_index = -1;
8297           noop_p = true;
8298         }
8299     }
8300
8301   if (n_loads)
8302     {
8303       if (repeating_p)
8304         *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8305       else
8306         {
8307           /* Enforced above when !repeating_p.  */
8308           unsigned int const_nunits = nunits.to_constant ();
8309           *n_loads = 0;
8310           bool load_seen = false;
8311           for (unsigned i = 0; i < in_nlanes; ++i)
8312             {
8313               if (i % const_nunits == 0)
8314                 {
8315                   if (load_seen)
8316                     *n_loads += 1;
8317                   load_seen = false;
8318                 }
8319               if (bitmap_bit_p (used_in_lanes, i))
8320                 load_seen = true;
8321             }
8322           if (load_seen)
8323             *n_loads += 1;
8324         }
8325     }
8326
8327   if (dce_chain)
8328     for (unsigned i = 0; i < dr_chain.length (); ++i)
8329       if (!bitmap_bit_p (used_defs, i))
8330         {
8331           gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
8332           gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8333           gsi_remove (&rgsi, true);
8334           release_defs (stmt);
8335         }
8336
8337   return true;
8338 }
8339
8340 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8341    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8342    permute statements for the SLP node NODE.  Store the number of vector
8343    permute instructions in *N_PERMS and the number of vector load
8344    instructions in *N_LOADS.  If DCE_CHAIN is true, remove all definitions
8345    that were not needed.  */
8346
8347 bool
8348 vect_transform_slp_perm_load (vec_info *vinfo,
8349                               slp_tree node, const vec<tree> &dr_chain,
8350                               gimple_stmt_iterator *gsi, poly_uint64 vf,
8351                               bool analyze_only, unsigned *n_perms,
8352                               unsigned int *n_loads, bool dce_chain)
8353 {
8354   return vect_transform_slp_perm_load_1 (vinfo, node,
8355                                          SLP_TREE_LOAD_PERMUTATION (node),
8356                                          dr_chain, gsi, vf, analyze_only,
8357                                          dump_enabled_p (), n_perms, n_loads,
8358                                          dce_chain);
8359 }
8360
8361 /* Produce the next vector result for SLP permutation NODE by adding a vector
8362    statement at GSI.  If MASK_VEC is nonnull, add:
8363
8364       <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8365
8366    otherwise add:
8367
8368       <new SSA name> = FIRST_DEF.  */
8369
8370 static void
8371 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8372                           slp_tree node, tree first_def, tree second_def,
8373                           tree mask_vec)
8374 {
8375   tree vectype = SLP_TREE_VECTYPE (node);
8376
8377   /* ???  We SLP match existing vector element extracts but
8378      allow punning which we need to re-instantiate at uses
8379      but have no good way of explicitly representing.  */
8380   if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8381       && !types_compatible_p (TREE_TYPE (first_def), vectype))
8382     {
8383       gassign *conv_stmt
8384         = gimple_build_assign (make_ssa_name (vectype),
8385                                build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8386       vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8387       first_def = gimple_assign_lhs (conv_stmt);
8388     }
8389   gassign *perm_stmt;
8390   tree perm_dest = make_ssa_name (vectype);
8391   if (mask_vec)
8392     {
8393       if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8394                            TYPE_SIZE (vectype))
8395           && !types_compatible_p (TREE_TYPE (second_def), vectype))
8396         {
8397           gassign *conv_stmt
8398             = gimple_build_assign (make_ssa_name (vectype),
8399                                    build1 (VIEW_CONVERT_EXPR,
8400                                            vectype, second_def));
8401           vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8402           second_def = gimple_assign_lhs (conv_stmt);
8403         }
8404       perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8405                                        first_def, second_def,
8406                                        mask_vec);
8407     }
8408   else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8409     {
8410       /* For identity permutes we still need to handle the case
8411          of lowpart extracts or concats.  */
8412       unsigned HOST_WIDE_INT c;
8413       auto first_def_nunits
8414         = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8415       if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8416         {
8417           tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8418                                  TYPE_SIZE (vectype), bitsize_zero_node);
8419           perm_stmt = gimple_build_assign (perm_dest, lowpart);
8420         }
8421       else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8422                                     first_def_nunits, &c) && c == 2)
8423         {
8424           tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8425                                             NULL_TREE, second_def);
8426           perm_stmt = gimple_build_assign (perm_dest, ctor);
8427         }
8428       else
8429         gcc_unreachable ();
8430     }
8431   else
8432     {
8433       /* We need a copy here in case the def was external.  */
8434       perm_stmt = gimple_build_assign (perm_dest, first_def);
8435     }
8436   vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8437   /* Store the vector statement in NODE.  */
8438   SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
8439 }
8440
8441 /* Subroutine of vectorizable_slp_permutation.  Check whether the target
8442    can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8443    If GSI is nonnull, emit the permutation there.
8444
8445    When GSI is null, the only purpose of NODE is to give properties
8446    of the result, such as the vector type and number of SLP lanes.
8447    The node does not need to be a VEC_PERM_EXPR.
8448
8449    If the target supports the operation, return the number of individual
8450    VEC_PERM_EXPRs needed, otherwise return -1.  Print information to the
8451    dump file if DUMP_P is true.  */
8452
8453 static int
8454 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8455                                 slp_tree node, lane_permutation_t &perm,
8456                                 vec<slp_tree> &children, bool dump_p)
8457 {
8458   tree vectype = SLP_TREE_VECTYPE (node);
8459
8460   /* ???  We currently only support all same vector input types
8461      while the SLP IL should really do a concat + select and thus accept
8462      arbitrary mismatches.  */
8463   slp_tree child;
8464   unsigned i;
8465   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8466   bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8467   tree op_vectype = NULL_TREE;
8468   FOR_EACH_VEC_ELT (children, i, child)
8469     if (SLP_TREE_VECTYPE (child))
8470       {
8471         op_vectype = SLP_TREE_VECTYPE (child);
8472         break;
8473       }
8474   if (!op_vectype)
8475     op_vectype = vectype;
8476   FOR_EACH_VEC_ELT (children, i, child)
8477     {
8478       if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8479            && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8480           || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8481           || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8482         {
8483           if (dump_p)
8484             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8485                              "Unsupported vector types in lane permutation\n");
8486           return -1;
8487         }
8488       if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8489         repeating_p = false;
8490     }
8491
8492   gcc_assert (perm.length () == SLP_TREE_LANES (node));
8493   if (dump_p)
8494     {
8495       dump_printf_loc (MSG_NOTE, vect_location,
8496                        "vectorizing permutation");
8497       for (unsigned i = 0; i < perm.length (); ++i)
8498         dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8499       if (repeating_p)
8500         dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8501       dump_printf (MSG_NOTE, "\n");
8502     }
8503
8504   /* REPEATING_P is true if every output vector is guaranteed to use the
8505      same permute vector.  We can handle that case for both variable-length
8506      and constant-length vectors, but we only handle other cases for
8507      constant-length vectors.
8508
8509      Set:
8510
8511      - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8512        mask vector that we want to build.
8513
8514      - NCOPIES to the number of copies of PERM that we need in order
8515        to build the necessary permute mask vectors.
8516
8517      - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8518        for each permute mask vector.  This is only relevant when GSI is
8519        nonnull.  */
8520   uint64_t npatterns;
8521   unsigned nelts_per_pattern;
8522   uint64_t ncopies;
8523   unsigned noutputs_per_mask;
8524   if (repeating_p)
8525     {
8526       /* We need a single permute mask vector that has the form:
8527
8528            { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8529
8530          In other words, the original n-element permute in PERM is
8531          "unrolled" to fill a full vector.  The stepped vector encoding
8532          that we use for permutes requires 3n elements.  */
8533       npatterns = SLP_TREE_LANES (node);
8534       nelts_per_pattern = ncopies = 3;
8535       noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8536     }
8537   else
8538     {
8539       /* Calculate every element of every permute mask vector explicitly,
8540          instead of relying on the pattern described above.  */
8541       if (!nunits.is_constant (&npatterns))
8542         return -1;
8543       nelts_per_pattern = ncopies = 1;
8544       if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
8545         if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
8546           return -1;
8547       noutputs_per_mask = 1;
8548     }
8549   unsigned olanes = ncopies * SLP_TREE_LANES (node);
8550   gcc_assert (repeating_p || multiple_p (olanes, nunits));
8551
8552   /* Compute the { { SLP operand, vector index}, lane } permutation sequence
8553      from the { SLP operand, scalar lane } permutation as recorded in the
8554      SLP node as intermediate step.  This part should already work
8555      with SLP children with arbitrary number of lanes.  */
8556   auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
8557   auto_vec<unsigned> active_lane;
8558   vperm.create (olanes);
8559   active_lane.safe_grow_cleared (children.length (), true);
8560   for (unsigned i = 0; i < ncopies; ++i)
8561     {
8562       for (unsigned pi = 0; pi < perm.length (); ++pi)
8563         {
8564           std::pair<unsigned, unsigned> p = perm[pi];
8565           tree vtype = SLP_TREE_VECTYPE (children[p.first]);
8566           if (repeating_p)
8567             vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
8568           else
8569             {
8570               /* We checked above that the vectors are constant-length.  */
8571               unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
8572               unsigned vi = (active_lane[p.first] + p.second) / vnunits;
8573               unsigned vl = (active_lane[p.first] + p.second) % vnunits;
8574               vperm.quick_push ({{p.first, vi}, vl});
8575             }
8576         }
8577       /* Advance to the next group.  */
8578       for (unsigned j = 0; j < children.length (); ++j)
8579         active_lane[j] += SLP_TREE_LANES (children[j]);
8580     }
8581
8582   if (dump_p)
8583     {
8584       dump_printf_loc (MSG_NOTE, vect_location,
8585                        "vectorizing permutation");
8586       for (unsigned i = 0; i < perm.length (); ++i)
8587         dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8588       if (repeating_p)
8589         dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8590       dump_printf (MSG_NOTE, "\n");
8591       dump_printf_loc (MSG_NOTE, vect_location, "as");
8592       for (unsigned i = 0; i < vperm.length (); ++i)
8593         {
8594           if (i != 0
8595               && (repeating_p
8596                   ? multiple_p (i, npatterns)
8597                   : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
8598             dump_printf (MSG_NOTE, ",");
8599           dump_printf (MSG_NOTE, " vops%u[%u][%u]",
8600                        vperm[i].first.first, vperm[i].first.second,
8601                        vperm[i].second);
8602         }
8603       dump_printf (MSG_NOTE, "\n");
8604     }
8605
8606   /* We can only handle two-vector permutes, everything else should
8607      be lowered on the SLP level.  The following is closely inspired
8608      by vect_transform_slp_perm_load and is supposed to eventually
8609      replace it.
8610      ???   As intermediate step do code-gen in the SLP tree representation
8611      somehow?  */
8612   std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
8613   std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
8614   unsigned int index = 0;
8615   poly_uint64 mask_element;
8616   vec_perm_builder mask;
8617   mask.new_vector (nunits, npatterns, nelts_per_pattern);
8618   unsigned int count = mask.encoded_nelts ();
8619   mask.quick_grow (count);
8620   vec_perm_indices indices;
8621   unsigned nperms = 0;
8622   for (unsigned i = 0; i < vperm.length (); ++i)
8623     {
8624       mask_element = vperm[i].second;
8625       if (first_vec.first == -1U
8626           || first_vec == vperm[i].first)
8627         first_vec = vperm[i].first;
8628       else if (second_vec.first == -1U
8629                || second_vec == vperm[i].first)
8630         {
8631           second_vec = vperm[i].first;
8632           mask_element += nunits;
8633         }
8634       else
8635         {
8636           if (dump_p)
8637             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8638                              "permutation requires at "
8639                              "least three vectors\n");
8640           gcc_assert (!gsi);
8641           return -1;
8642         }
8643
8644       mask[index++] = mask_element;
8645
8646       if (index == count)
8647         {
8648           indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
8649                               TYPE_VECTOR_SUBPARTS (op_vectype));
8650           bool identity_p = indices.series_p (0, 1, 0, 1);
8651           machine_mode vmode = TYPE_MODE (vectype);
8652           machine_mode op_vmode = TYPE_MODE (op_vectype);
8653           unsigned HOST_WIDE_INT c;
8654           if ((!identity_p
8655                && !can_vec_perm_const_p (vmode, op_vmode, indices))
8656               || (identity_p
8657                   && !known_le (nunits,
8658                                 TYPE_VECTOR_SUBPARTS (op_vectype))
8659                   && (!constant_multiple_p (nunits,
8660                                             TYPE_VECTOR_SUBPARTS (op_vectype),
8661                                             &c) || c != 2)))
8662             {
8663               if (dump_p)
8664                 {
8665                   dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8666                                    vect_location,
8667                                    "unsupported vect permute { ");
8668                   for (i = 0; i < count; ++i)
8669                     {
8670                       dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8671                       dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8672                     }
8673                   dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8674                 }
8675               gcc_assert (!gsi);
8676               return -1;
8677             }
8678
8679           if (!identity_p)
8680             nperms++;
8681           if (gsi)
8682             {
8683               if (second_vec.first == -1U)
8684                 second_vec = first_vec;
8685
8686               slp_tree
8687                 first_node = children[first_vec.first],
8688                 second_node = children[second_vec.first];
8689
8690               tree mask_vec = NULL_TREE;
8691               if (!identity_p)
8692                 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8693
8694               for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
8695                 {
8696                   tree first_def
8697                     = vect_get_slp_vect_def (first_node,
8698                                              first_vec.second + vi);
8699                   tree second_def
8700                     = vect_get_slp_vect_def (second_node,
8701                                              second_vec.second + vi);
8702                   vect_add_slp_permutation (vinfo, gsi, node, first_def,
8703                                             second_def, mask_vec);
8704                 }
8705             }
8706
8707           index = 0;
8708           first_vec = std::make_pair (-1U, -1U);
8709           second_vec = std::make_pair (-1U, -1U);
8710         }
8711     }
8712
8713   return nperms;
8714 }
8715
8716 /* Vectorize the SLP permutations in NODE as specified
8717    in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
8718    child number and lane number.
8719    Interleaving of two two-lane two-child SLP subtrees (not supported):
8720      [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
8721    A blend of two four-lane two-child SLP subtrees:
8722      [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
8723    Highpart of a four-lane one-child SLP subtree (not supported):
8724      [ { 0, 2 }, { 0, 3 } ]
8725    Where currently only a subset is supported by code generating below.  */
8726
8727 static bool
8728 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8729                               slp_tree node, stmt_vector_for_cost *cost_vec)
8730 {
8731   tree vectype = SLP_TREE_VECTYPE (node);
8732   lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
8733   int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
8734                                                SLP_TREE_CHILDREN (node),
8735                                                dump_enabled_p ());
8736   if (nperms < 0)
8737     return false;
8738
8739   if (!gsi)
8740     record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
8741
8742   return true;
8743 }
8744
8745 /* Vectorize SLP NODE.  */
8746
8747 static void
8748 vect_schedule_slp_node (vec_info *vinfo,
8749                         slp_tree node, slp_instance instance)
8750 {
8751   gimple_stmt_iterator si;
8752   int i;
8753   slp_tree child;
8754
8755   /* For existing vectors there's nothing to do.  */
8756   if (SLP_TREE_VEC_DEFS (node).exists ())
8757     return;
8758
8759   gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
8760
8761   /* Vectorize externals and constants.  */
8762   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
8763       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
8764     {
8765       /* ???  vectorizable_shift can end up using a scalar operand which is
8766          currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
8767          node in this case.  */
8768       if (!SLP_TREE_VECTYPE (node))
8769         return;
8770
8771       vect_create_constant_vectors (vinfo, node);
8772       return;
8773     }
8774
8775   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
8776
8777   gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
8778   SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
8779
8780   if (dump_enabled_p ())
8781     dump_printf_loc (MSG_NOTE, vect_location,
8782                      "------>vectorizing SLP node starting from: %G",
8783                      stmt_info->stmt);
8784
8785   if (STMT_VINFO_DATA_REF (stmt_info)
8786       && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8787     {
8788       /* Vectorized loads go before the first scalar load to make it
8789          ready early, vectorized stores go before the last scalar
8790          stmt which is where all uses are ready.  */
8791       stmt_vec_info last_stmt_info = NULL;
8792       if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
8793         last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
8794       else /* DR_IS_WRITE */
8795         last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
8796       si = gsi_for_stmt (last_stmt_info->stmt);
8797     }
8798   else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
8799             || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
8800             || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
8801            && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8802     {
8803       /* For PHI node vectorization we do not use the insertion iterator.  */
8804       si = gsi_none ();
8805     }
8806   else
8807     {
8808       /* Emit other stmts after the children vectorized defs which is
8809          earliest possible.  */
8810       gimple *last_stmt = NULL;
8811       bool seen_vector_def = false;
8812       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8813         if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8814           {
8815             /* For fold-left reductions we are retaining the scalar
8816                reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
8817                set so the representation isn't perfect.  Resort to the
8818                last scalar def here.  */
8819             if (SLP_TREE_VEC_STMTS (child).is_empty ())
8820               {
8821                 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
8822                             == cycle_phi_info_type);
8823                 gphi *phi = as_a <gphi *>
8824                               (vect_find_last_scalar_stmt_in_slp (child)->stmt);
8825                 if (!last_stmt
8826                     || vect_stmt_dominates_stmt_p (last_stmt, phi))
8827                   last_stmt = phi;
8828               }
8829             /* We are emitting all vectorized stmts in the same place and
8830                the last one is the last.
8831                ???  Unless we have a load permutation applied and that
8832                figures to re-use an earlier generated load.  */
8833             unsigned j;
8834             gimple *vstmt;
8835             FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
8836               if (!last_stmt
8837                   || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
8838                 last_stmt = vstmt;
8839           }
8840         else if (!SLP_TREE_VECTYPE (child))
8841           {
8842             /* For externals we use unvectorized at all scalar defs.  */
8843             unsigned j;
8844             tree def;
8845             FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
8846               if (TREE_CODE (def) == SSA_NAME
8847                   && !SSA_NAME_IS_DEFAULT_DEF (def))
8848                 {
8849                   gimple *stmt = SSA_NAME_DEF_STMT (def);
8850                   if (!last_stmt
8851                       || vect_stmt_dominates_stmt_p (last_stmt, stmt))
8852                     last_stmt = stmt;
8853                 }
8854           }
8855         else
8856           {
8857             /* For externals we have to look at all defs since their
8858                insertion place is decided per vector.  But beware
8859                of pre-existing vectors where we need to make sure
8860                we do not insert before the region boundary.  */
8861             if (SLP_TREE_SCALAR_OPS (child).is_empty ()
8862                 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
8863               seen_vector_def = true;
8864             else
8865               {
8866                 unsigned j;
8867                 tree vdef;
8868                 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
8869                   if (TREE_CODE (vdef) == SSA_NAME
8870                       && !SSA_NAME_IS_DEFAULT_DEF (vdef))
8871                     {
8872                       gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
8873                       if (!last_stmt
8874                           || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
8875                         last_stmt = vstmt;
8876                     }
8877               }
8878           }
8879       /* This can happen when all children are pre-existing vectors or
8880          constants.  */
8881       if (!last_stmt)
8882         last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
8883       if (!last_stmt)
8884         {
8885           gcc_assert (seen_vector_def);
8886           si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
8887         }
8888       else if (is_ctrl_altering_stmt (last_stmt))
8889         {
8890           /* We split regions to vectorize at control altering stmts
8891              with a definition so this must be an external which
8892              we can insert at the start of the region.  */
8893           si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
8894         }
8895       else if (is_a <bb_vec_info> (vinfo)
8896                && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
8897                && gimple_could_trap_p (stmt_info->stmt))
8898         {
8899           /* We've constrained possibly trapping operations to all come
8900              from the same basic-block, if vectorized defs would allow earlier
8901              scheduling still force vectorized stmts to the original block.
8902              This is only necessary for BB vectorization since for loop vect
8903              all operations are in a single BB and scalar stmt based
8904              placement doesn't play well with epilogue vectorization.  */
8905           gcc_assert (dominated_by_p (CDI_DOMINATORS,
8906                                       gimple_bb (stmt_info->stmt),
8907                                       gimple_bb (last_stmt)));
8908           si = gsi_after_labels (gimple_bb (stmt_info->stmt));
8909         }
8910       else if (is_a <gphi *> (last_stmt))
8911         si = gsi_after_labels (gimple_bb (last_stmt));
8912       else
8913         {
8914           si = gsi_for_stmt (last_stmt);
8915           gsi_next (&si);
8916         }
8917     }
8918
8919   /* Handle purely internal nodes.  */
8920   if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
8921     {
8922       /* ???  the transform kind is stored to STMT_VINFO_TYPE which might
8923          be shared with different SLP nodes (but usually it's the same
8924          operation apart from the case the stmt is only there for denoting
8925          the actual scalar lane defs ...).  So do not call vect_transform_stmt
8926          but open-code it here (partly).  */
8927       bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
8928       gcc_assert (done);
8929       stmt_vec_info slp_stmt_info;
8930       unsigned int i;
8931       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
8932         if (STMT_VINFO_LIVE_P (slp_stmt_info))
8933           {
8934             done = vectorizable_live_operation (vinfo,
8935                                                 slp_stmt_info, &si, node,
8936                                                 instance, i, true, NULL);
8937             gcc_assert (done);
8938           }
8939     }
8940   else
8941     vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
8942 }
8943
8944 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
8945    For loop vectorization this is done in vectorizable_call, but for SLP
8946    it needs to be deferred until end of vect_schedule_slp, because multiple
8947    SLP instances may refer to the same scalar stmt.  */
8948
8949 static void
8950 vect_remove_slp_scalar_calls (vec_info *vinfo,
8951                               slp_tree node, hash_set<slp_tree> &visited)
8952 {
8953   gimple *new_stmt;
8954   gimple_stmt_iterator gsi;
8955   int i;
8956   slp_tree child;
8957   tree lhs;
8958   stmt_vec_info stmt_info;
8959
8960   if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
8961     return;
8962
8963   if (visited.add (node))
8964     return;
8965
8966   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8967     vect_remove_slp_scalar_calls (vinfo, child, visited);
8968
8969   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8970     {
8971       gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
8972       if (!stmt || gimple_bb (stmt) == NULL)
8973         continue;
8974       if (is_pattern_stmt_p (stmt_info)
8975           || !PURE_SLP_STMT (stmt_info))
8976         continue;
8977       lhs = gimple_call_lhs (stmt);
8978       new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
8979       gsi = gsi_for_stmt (stmt);
8980       vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
8981       SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
8982     }
8983 }
8984
8985 static void
8986 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
8987 {
8988   hash_set<slp_tree> visited;
8989   vect_remove_slp_scalar_calls (vinfo, node, visited);
8990 }
8991
8992 /* Vectorize the instance root.  */
8993
8994 void
8995 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
8996 {
8997   gassign *rstmt = NULL;
8998
8999   if (instance->kind == slp_inst_kind_ctor)
9000     {
9001       if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9002         {
9003           gimple *child_stmt = SLP_TREE_VEC_STMTS (node)[0];
9004           tree vect_lhs = gimple_get_lhs (child_stmt);
9005           tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9006           if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9007                                           TREE_TYPE (vect_lhs)))
9008             vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9009                                vect_lhs);
9010           rstmt = gimple_build_assign (root_lhs, vect_lhs);
9011         }
9012       else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9013         {
9014           int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9015           gimple *child_stmt;
9016           int j;
9017           vec<constructor_elt, va_gc> *v;
9018           vec_alloc (v, nelts);
9019
9020           /* A CTOR can handle V16HI composition from VNx8HI so we
9021              do not need to convert vector elements if the types
9022              do not match.  */
9023           FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
9024             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
9025                                     gimple_get_lhs (child_stmt));
9026           tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9027           tree rtype
9028             = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9029           tree r_constructor = build_constructor (rtype, v);
9030           rstmt = gimple_build_assign (lhs, r_constructor);
9031         }
9032     }
9033   else if (instance->kind == slp_inst_kind_bb_reduc)
9034     {
9035       /* Largely inspired by reduction chain epilogue handling in
9036          vect_create_epilog_for_reduction.  */
9037       vec<tree> vec_defs = vNULL;
9038       vect_get_slp_defs (node, &vec_defs);
9039       enum tree_code reduc_code
9040         = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9041       /* ???  We actually have to reflect signs somewhere.  */
9042       if (reduc_code == MINUS_EXPR)
9043         reduc_code = PLUS_EXPR;
9044       gimple_seq epilogue = NULL;
9045       /* We may end up with more than one vector result, reduce them
9046          to one vector.  */
9047       tree vec_def = vec_defs[0];
9048       for (unsigned i = 1; i < vec_defs.length (); ++i)
9049         vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
9050                                 vec_def, vec_defs[i]);
9051       vec_defs.release ();
9052       /* ???  Support other schemes than direct internal fn.  */
9053       internal_fn reduc_fn;
9054       if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9055           || reduc_fn == IFN_LAST)
9056         gcc_unreachable ();
9057       tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9058                                       TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
9059
9060       gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9061       gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9062       gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9063       update_stmt (gsi_stmt (rgsi));
9064       return;
9065     }
9066   else
9067     gcc_unreachable ();
9068
9069   gcc_assert (rstmt);
9070
9071   gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9072   gsi_replace (&rgsi, rstmt, true);
9073 }
9074
/* Per-node state kept by vect_schedule_scc while it walks the SLP graph
   looking for strongly connected components.  */

struct slp_scc_info
{
  /* True from the point the node is pushed on the DFS stack until it
     has been scheduled.  */
  bool on_stack;
  /* DFS number handed out when the node is first visited.  */
  int dfs;
  /* Starts out equal to DFS and is lowered to the smallest value seen
     through the node's children.  */
  int lowlink;
};
9081
9082 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs.  */
9083
9084 static void
9085 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9086                    hash_map<slp_tree, slp_scc_info> &scc_info,
9087                    int &maxdfs, vec<slp_tree> &stack)
9088 {
9089   bool existed_p;
9090   slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9091   gcc_assert (!existed_p);
9092   info->dfs = maxdfs;
9093   info->lowlink = maxdfs;
9094   maxdfs++;
9095
9096   /* Leaf.  */
9097   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9098     {
9099       info->on_stack = false;
9100       vect_schedule_slp_node (vinfo, node, instance);
9101       return;
9102     }
9103
9104   info->on_stack = true;
9105   stack.safe_push (node);
9106
9107   unsigned i;
9108   slp_tree child;
9109   /* DFS recurse.  */
9110   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9111     {
9112       if (!child)
9113         continue;
9114       slp_scc_info *child_info = scc_info.get (child);
9115       if (!child_info)
9116         {
9117           vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9118           /* Recursion might have re-allocated the node.  */
9119           info = scc_info.get (node);
9120           child_info = scc_info.get (child);
9121           info->lowlink = MIN (info->lowlink, child_info->lowlink);
9122         }
9123       else if (child_info->on_stack)
9124         info->lowlink = MIN (info->lowlink, child_info->dfs);
9125     }
9126   if (info->lowlink != info->dfs)
9127     return;
9128
9129   auto_vec<slp_tree, 4> phis_to_fixup;
9130
9131   /* Singleton.  */
9132   if (stack.last () == node)
9133     {
9134       stack.pop ();
9135       info->on_stack = false;
9136       vect_schedule_slp_node (vinfo, node, instance);
9137       if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9138           && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9139         phis_to_fixup.quick_push (node);
9140     }
9141   else
9142     {
9143       /* SCC.  */
9144       int last_idx = stack.length () - 1;
9145       while (stack[last_idx] != node)
9146         last_idx--;
9147       /* We can break the cycle at PHIs who have at least one child
9148          code generated.  Then we could re-start the DFS walk until
9149          all nodes in the SCC are covered (we might have new entries
9150          for only back-reachable nodes).  But it's simpler to just
9151          iterate and schedule those that are ready.  */
9152       unsigned todo = stack.length () - last_idx;
9153       do
9154         {
9155           for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9156             {
9157               slp_tree entry = stack[idx];
9158               if (!entry)
9159                 continue;
9160               bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9161                           && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9162               bool ready = !phi;
9163               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9164                   if (!child)
9165                     {
9166                       gcc_assert (phi);
9167                       ready = true;
9168                       break;
9169                     }
9170                   else if (scc_info.get (child)->on_stack)
9171                     {
9172                       if (!phi)
9173                         {
9174                           ready = false;
9175                           break;
9176                         }
9177                     }
9178                   else
9179                     {
9180                       if (phi)
9181                         {
9182                           ready = true;
9183                           break;
9184                         }
9185                     }
9186               if (ready)
9187                 {
9188                   vect_schedule_slp_node (vinfo, entry, instance);
9189                   scc_info.get (entry)->on_stack = false;
9190                   stack[idx] = NULL;
9191                   todo--;
9192                   if (phi)
9193                     phis_to_fixup.safe_push (entry);
9194                 }
9195             }
9196         }
9197       while (todo != 0);
9198
9199       /* Pop the SCC.  */
9200       stack.truncate (last_idx);
9201     }
9202
9203   /* Now fixup the backedge def of the vectorized PHIs in this SCC.  */
9204   slp_tree phi_node;
9205   FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9206     {
9207       gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9208       edge_iterator ei;
9209       edge e;
9210       FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9211         {
9212           unsigned dest_idx = e->dest_idx;
9213           child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9214           if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9215             continue;
9216           unsigned n = SLP_TREE_VEC_STMTS (phi_node).length ();
9217           /* Simply fill all args.  */
9218           if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9219               != vect_first_order_recurrence)
9220             for (unsigned i = 0; i < n; ++i)
9221               add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
9222                            vect_get_slp_vect_def (child, i),
9223                            e, gimple_phi_arg_location (phi, dest_idx));
9224           else
9225             {
9226               /* Unless it is a first order recurrence which needs
9227                  args filled in for both the PHI node and the permutes.  */
9228               gimple *perm = SLP_TREE_VEC_STMTS (phi_node)[0];
9229               gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9230               add_phi_arg (as_a <gphi *> (rphi),
9231                            vect_get_slp_vect_def (child, n - 1),
9232                            e, gimple_phi_arg_location (phi, dest_idx));
9233               for (unsigned i = 0; i < n; ++i)
9234                 {
9235                   gimple *perm = SLP_TREE_VEC_STMTS (phi_node)[i];
9236                   if (i > 0)
9237                     gimple_assign_set_rhs1 (perm,
9238                                             vect_get_slp_vect_def (child, i - 1));
9239                   gimple_assign_set_rhs2 (perm,
9240                                           vect_get_slp_vect_def (child, i));
9241                   update_stmt (perm);
9242                 }
9243             }
9244         }
9245     }
9246 }
9247
9248 /* Generate vector code for SLP_INSTANCES in the loop/basic block.  */
9249
9250 void
9251 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9252 {
9253   slp_instance instance;
9254   unsigned int i;
9255
9256   hash_map<slp_tree, slp_scc_info> scc_info;
9257   int maxdfs = 0;
9258   FOR_EACH_VEC_ELT (slp_instances, i, instance)
9259     {
9260       slp_tree node = SLP_INSTANCE_TREE (instance);
9261       if (dump_enabled_p ())
9262         {
9263           dump_printf_loc (MSG_NOTE, vect_location,
9264                            "Vectorizing SLP tree:\n");
9265           /* ???  Dump all?  */
9266           if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9267             dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9268                          SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9269           vect_print_slp_graph (MSG_NOTE, vect_location,
9270                                 SLP_INSTANCE_TREE (instance));
9271         }
9272       /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
9273          have a PHI be the node breaking the cycle.  */
9274       auto_vec<slp_tree> stack;
9275       if (!scc_info.get (node))
9276         vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9277
9278       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9279         vectorize_slp_instance_root_stmt (node, instance);
9280
9281       if (dump_enabled_p ())
9282         dump_printf_loc (MSG_NOTE, vect_location,
9283                          "vectorizing stmts using SLP.\n");
9284     }
9285
9286   FOR_EACH_VEC_ELT (slp_instances, i, instance)
9287     {
9288       slp_tree root = SLP_INSTANCE_TREE (instance);
9289       stmt_vec_info store_info;
9290       unsigned int j;
9291
9292       /* Remove scalar call stmts.  Do not do this for basic-block
9293          vectorization as not all uses may be vectorized.
9294          ???  Why should this be necessary?  DCE should be able to
9295          remove the stmts itself.
9296          ???  For BB vectorization we can as well remove scalar
9297          stmts starting from the SLP tree root if they have no
9298          uses.  */
9299       if (is_a <loop_vec_info> (vinfo))
9300         vect_remove_slp_scalar_calls (vinfo, root);
9301
9302       /* Remove vectorized stores original scalar stmts.  */
9303       for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9304         {
9305           if (!STMT_VINFO_DATA_REF (store_info)
9306               || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9307             break;
9308
9309           store_info = vect_orig_stmt (store_info);
9310           /* Free the attached stmt_vec_info and remove the stmt.  */
9311           vinfo->remove_stmt (store_info);
9312
9313           /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
9314              to not crash in vect_free_slp_tree later.  */
9315           if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9316             SLP_TREE_REPRESENTATIVE (root) = NULL;
9317         }
9318     }
9319 }