gcc/tree-vect-slp.cc

   1 /* SLP - Basic Block Vectorization
   2    Copyright (C) 2007-2023 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #define INCLUDE_ALGORITHM
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "backend.h"
  27 #include "target.h"
  28 #include "rtl.h"
  29 #include "tree.h"
  30 #include "gimple.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "insn-config.h"
  35 #include "recog.h"              /* FIXME: for insn_data */
  36 #include "fold-const.h"
  37 #include "stor-layout.h"
  38 #include "gimple-iterator.h"
  39 #include "cfgloop.h"
  40 #include "tree-vectorizer.h"
  41 #include "langhooks.h"
  42 #include "gimple-walk.h"
  43 #include "dbgcnt.h"
  44 #include "tree-vector-builder.h"
  45 #include "vec-perm-indices.h"
  46 #include "gimple-fold.h"
  47 #include "internal-fn.h"
  48 #include "dump-context.h"
  49 #include "cfganal.h"
  50 #include "tree-eh.h"
  51 #include "tree-cfg.h"
  52 #include "alloc-pool.h"
  53 #include "sreal.h"
  54 #include "predict.h"
  55
   /* File-local helpers that are declared ahead of their use; their
      definitions are not in this part of the file.  */
   56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
   57                                             load_permutation_t &,
   58                                             const vec<tree> &,
   59                                             gimple_stmt_iterator *,
   60                                             poly_uint64, bool, bool,
   61                                             unsigned *,
   62                                             unsigned * = nullptr,
   63                                             bool = false);
   64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
   65                                            slp_tree, lane_permutation_t &,
   66                                            vec<slp_tree> &, bool);
   67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
   68                                           slp_tree, stmt_vector_for_cost *);
   69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
   70
   /* Allocation pool backing every _slp_tree object.  Created by
      vect_slp_init, destroyed by vect_slp_fini, and used by
      _slp_tree::operator new / operator delete.  */
   71 static object_allocator<_slp_tree> *slp_tree_pool;
   /* Head of the doubly-linked list (prev_node / next_node) of all live SLP
      nodes.  The _slp_tree constructor pushes new nodes at the front and the
      destructor unlinks them; vect_slp_fini uses it to delete leftovers.  */
   72 static slp_tree slp_first_node;
  73
  74 void
  75 vect_slp_init (void)
  76 {
  77   slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
  78 }
  79
  80 void
  81 vect_slp_fini (void)
  82 {
  83   while (slp_first_node)
  84     delete slp_first_node;
  85   delete slp_tree_pool;
  86   slp_tree_pool = NULL;
  87 }
  88
  89 void *
  90 _slp_tree::operator new (size_t n)
  91 {
  92   gcc_assert (n == sizeof (_slp_tree));
  93   return slp_tree_pool->allocate_raw ();
  94 }
  95
  96 void
  97 _slp_tree::operator delete (void *node, size_t n)
  98 {
  99   gcc_assert (n == sizeof (_slp_tree));
 100   slp_tree_pool->remove_raw (node);
 101 }
 102
 103
 104 /* Initialize a SLP node.  */
 105
 106 _slp_tree::_slp_tree ()
 107 {
 108   this->prev_node = NULL;
 109   if (slp_first_node)
 110     slp_first_node->prev_node = this;
 111   this->next_node = slp_first_node;
 112   slp_first_node = this;
 113   SLP_TREE_SCALAR_STMTS (this) = vNULL;
 114   SLP_TREE_SCALAR_OPS (this) = vNULL;
 115   SLP_TREE_VEC_STMTS (this) = vNULL;
 116   SLP_TREE_VEC_DEFS (this) = vNULL;
 117   SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
 118   SLP_TREE_CHILDREN (this) = vNULL;
 119   SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
 120   SLP_TREE_LANE_PERMUTATION (this) = vNULL;
 121   SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
 122   SLP_TREE_CODE (this) = ERROR_MARK;
 123   SLP_TREE_VECTYPE (this) = NULL_TREE;
 124   SLP_TREE_REPRESENTATIVE (this) = NULL;
 125   SLP_TREE_REF_COUNT (this) = 1;
 126   this->failed = NULL;
 127   this->max_nunits = 1;
 128   this->lanes = 0;
 129 }
 130
 131 /* Tear down a SLP node.  */
 132
 133 _slp_tree::~_slp_tree ()
 134 {
 135   if (this->prev_node)
 136     this->prev_node->next_node = this->next_node;
 137   else
 138     slp_first_node = this->next_node;
 139   if (this->next_node)
 140     this->next_node->prev_node = this->prev_node;
 141   SLP_TREE_CHILDREN (this).release ();
 142   SLP_TREE_SCALAR_STMTS (this).release ();
 143   SLP_TREE_SCALAR_OPS (this).release ();
 144   SLP_TREE_VEC_STMTS (this).release ();
 145   SLP_TREE_VEC_DEFS (this).release ();
 146   SLP_TREE_LOAD_PERMUTATION (this).release ();
 147   SLP_TREE_LANE_PERMUTATION (this).release ();
 148   if (this->failed)
 149     free (failed);
 150 }
 151
 152 /* Recursively free the memory allocated for the SLP tree rooted at NODE.  */
 153
 154 void
 155 vect_free_slp_tree (slp_tree node)
 156 {
 157   int i;
 158   slp_tree child;
 159
 160   if (--SLP_TREE_REF_COUNT (node) != 0)
 161     return;
 162
 163   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
 164     if (child)
 165       vect_free_slp_tree (child);
 166
 167   /* If the node defines any SLP only patterns then those patterns are no
 168      longer valid and should be removed.  */
 169   stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
 170   if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
 171     {
 172       stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
 173       STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
 174       STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
 175     }
 176
 177   delete node;
 178 }
 179
 180 /* Return a location suitable for dumpings related to the SLP instance.  */
 181
 182 dump_user_location_t
 183 _slp_instance::location () const
 184 {
 185   if (!root_stmts.is_empty ())
 186     return root_stmts[0]->stmt;
 187   else
 188     return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
 189 }
 190
 191
 192 /* Free the memory allocated for the SLP instance.  */
 193
 194 void
 195 vect_free_slp_instance (slp_instance instance)
 196 {
 197   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
 198   SLP_INSTANCE_LOADS (instance).release ();
 199   SLP_INSTANCE_ROOT_STMTS (instance).release ();
 200   instance->subgraph_entries.release ();
 201   instance->cost_vec.release ();
 202   free (instance);
 203 }
 204
 205
 206 /* Create an SLP node for SCALAR_STMTS.  */
 207
 208 slp_tree
 209 vect_create_new_slp_node (unsigned nops, tree_code code)
 210 {
 211   slp_tree node = new _slp_tree;
 212   SLP_TREE_SCALAR_STMTS (node) = vNULL;
 213   SLP_TREE_CHILDREN (node).create (nops);
 214   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
 215   SLP_TREE_CODE (node) = code;
 216   return node;
 217 }
 218 /* Create an SLP node for SCALAR_STMTS.  */
 219
 220 static slp_tree
 221 vect_create_new_slp_node (slp_tree node,
 222                           vec<stmt_vec_info> scalar_stmts, unsigned nops)
 223 {
 224   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
 225   SLP_TREE_CHILDREN (node).create (nops);
 226   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
 227   SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
 228   SLP_TREE_LANES (node) = scalar_stmts.length ();
 229   return node;
 230 }
 231
 232 /* Create an SLP node for SCALAR_STMTS.  */
 233
 234 static slp_tree
 235 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
 236 {
 237   return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
 238 }
 239
 240 /* Create an SLP node for OPS.  */
 241
 242 static slp_tree
 243 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
 244 {
 245   SLP_TREE_SCALAR_OPS (node) = ops;
 246   SLP_TREE_DEF_TYPE (node) = vect_external_def;
 247   SLP_TREE_LANES (node) = ops.length ();
 248   return node;
 249 }
 250
 251 /* Create an SLP node for OPS.  */
 252
 253 static slp_tree
 254 vect_create_new_slp_node (vec<tree> ops)
 255 {
 256   return vect_create_new_slp_node (new _slp_tree, ops);
 257 }
 258
 259
 260 /* This structure is used in creation of an SLP tree.  Each instance
 261    corresponds to the same operand in a group of scalar stmts in an SLP
 262    node.  */
 263 typedef struct _slp_oprnd_info
 264 {
 265   /* Def-stmts for the operands.  */
 266   vec<stmt_vec_info> def_stmts;
 267   /* Operands.  */
 268   vec<tree> ops;
 269   /* Information about the first statement, its vector def-type, type, the
 270      operand itself in case it's constant, and an indication if it's a pattern
 271      stmt.  */
 272   tree first_op_type;
 273   enum vect_def_type first_dt;
 274   bool any_pattern;
 275 } *slp_oprnd_info;
 276
 277
 278 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
 279    operand.  */
 280 static vec<slp_oprnd_info>
 281 vect_create_oprnd_info (int nops, int group_size)
 282 {
 283   int i;
 284   slp_oprnd_info oprnd_info;
 285   vec<slp_oprnd_info> oprnds_info;
 286
 287   oprnds_info.create (nops);
 288   for (i = 0; i < nops; i++)
 289     {
 290       oprnd_info = XNEW (struct _slp_oprnd_info);
 291       oprnd_info->def_stmts.create (group_size);
 292       oprnd_info->ops.create (group_size);
 293       oprnd_info->first_dt = vect_uninitialized_def;
 294       oprnd_info->first_op_type = NULL_TREE;
 295       oprnd_info->any_pattern = false;
 296       oprnds_info.quick_push (oprnd_info);
 297     }
 298
 299   return oprnds_info;
 300 }
 301
 302
 303 /* Free operands info.  */
 304
 305 static void
 306 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
 307 {
 308   int i;
 309   slp_oprnd_info oprnd_info;
 310
 311   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
 312     {
 313       oprnd_info->def_stmts.release ();
 314       oprnd_info->ops.release ();
 315       XDELETE (oprnd_info);
 316     }
 317
 318   oprnds_info.release ();
 319 }
 320
 321 /* Return the execution frequency of NODE (so that a higher value indicates
 322    a "more important" node when optimizing for speed).  */
 323
 324 static sreal
 325 vect_slp_node_weight (slp_tree node)
 326 {
 327   stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
 328   basic_block bb = gimple_bb (stmt_info->stmt);
 329   return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
 330 }
 331
 332 /* Return true if STMTS contains a pattern statement.  */
 333
 334 static bool
 335 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
 336 {
 337   stmt_vec_info stmt_info;
 338   unsigned int i;
 339   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
 340     if (is_pattern_stmt_p (stmt_info))
 341       return true;
 342   return false;
 343 }
 344
 345 /* Return true when all lanes in the external or constant NODE have
 346    the same value.  */
 347
 348 static bool
 349 vect_slp_tree_uniform_p (slp_tree node)
 350 {
 351   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
 352               || SLP_TREE_DEF_TYPE (node) == vect_external_def);
 353
 354   /* Pre-exsting vectors.  */
 355   if (SLP_TREE_SCALAR_OPS (node).is_empty ())
 356     return false;
 357
 358   unsigned i;
 359   tree op, first = NULL_TREE;
 360   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
 361     if (!first)
 362       first = op;
 363     else if (!operand_equal_p (first, op, 0))
 364       return false;
 365
 366   return true;
 367 }
 368
 369 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
 370    that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
 371    of the chain.  */
 372
 373 int
 374 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
 375                                       stmt_vec_info first_stmt_info)
 376 {
 377   stmt_vec_info next_stmt_info = first_stmt_info;
 378   int result = 0;
 379
 380   if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
 381     return -1;
 382
 383   do
 384     {
 385       if (next_stmt_info == stmt_info)
 386         return result;
 387       next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
 388       if (next_stmt_info)
 389         result += DR_GROUP_GAP (next_stmt_info);
 390     }
 391   while (next_stmt_info);
 392
 393   return -1;
 394 }
 395
 396 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
 397    using the method implemented by duplicate_and_interleave.  Return true
 398    if so, returning the number of intermediate vectors in *NVECTORS_OUT
 399    (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
 400    (if nonnull).  */
 401
 402 bool
 403 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
 404                                 tree elt_type, unsigned int *nvectors_out,
 405                                 tree *vector_type_out,
 406                                 tree *permutes)
 407 {
 408   tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
 409   if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
 410     return false;
 411
 412   machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
 413   poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
 414   unsigned int nvectors = 1;
 415   for (;;)
 416     {
 417       scalar_int_mode int_mode;
 418       poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
 419       if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
 420         {
 421           /* Get the natural vector type for this SLP group size.  */
 422           tree int_type = build_nonstandard_integer_type
 423             (GET_MODE_BITSIZE (int_mode), 1);
 424           tree vector_type
 425             = get_vectype_for_scalar_type (vinfo, int_type, count);
 426           if (vector_type
 427               && VECTOR_MODE_P (TYPE_MODE (vector_type))
 428               && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
 429                            GET_MODE_SIZE (base_vector_mode)))
 430             {
 431               /* Try fusing consecutive sequences of COUNT / NVECTORS elements
 432                  together into elements of type INT_TYPE and using the result
 433                  to build NVECTORS vectors.  */
 434               poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
 435               vec_perm_builder sel1 (nelts, 2, 3);
 436               vec_perm_builder sel2 (nelts, 2, 3);
 437               poly_int64 half_nelts = exact_div (nelts, 2);
 438               for (unsigned int i = 0; i < 3; ++i)
 439                 {
 440                   sel1.quick_push (i);
 441                   sel1.quick_push (i + nelts);
 442                   sel2.quick_push (half_nelts + i);
 443                   sel2.quick_push (half_nelts + i + nelts);
 444                 }
 445               vec_perm_indices indices1 (sel1, 2, nelts);
 446               vec_perm_indices indices2 (sel2, 2, nelts);
 447               machine_mode vmode = TYPE_MODE (vector_type);
 448               if (can_vec_perm_const_p (vmode, vmode, indices1)
 449                   && can_vec_perm_const_p (vmode, vmode, indices2))
 450                 {
 451                   if (nvectors_out)
 452                     *nvectors_out = nvectors;
 453                   if (vector_type_out)
 454                     *vector_type_out = vector_type;
 455                   if (permutes)
 456                     {
 457                       permutes[0] = vect_gen_perm_mask_checked (vector_type,
 458                                                                 indices1);
 459                       permutes[1] = vect_gen_perm_mask_checked (vector_type,
 460                                                                 indices2);
 461                     }
 462                   return true;
 463                 }
 464             }
 465         }
 466       if (!multiple_p (elt_bytes, 2, &elt_bytes))
 467         return false;
 468       nvectors *= 2;
 469     }
 470 }
 471
 472 /* Return true if DTA and DTB match.  */
 473
 474 static bool
 475 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
 476 {
 477   return (dta == dtb
 478           || ((dta == vect_external_def || dta == vect_constant_def)
 479               && (dtb == vect_external_def || dtb == vect_constant_def)));
 480 }
 481
 482 static const int cond_expr_maps[3][5] = {
 483   { 4, -1, -2, 1, 2 },
 484   { 4, -2, -1, 1, 2 },
 485   { 4, -1, -2, 2, 1 }
 486 };
 487 static const int arg1_map[] = { 1, 1 };
 488 static const int arg2_map[] = { 1, 2 };
 489 static const int arg1_arg4_map[] = { 2, 1, 4 };
 490 static const int op1_op0_map[] = { 2, 1, 0 };
 491
 492 /* For most SLP statements, there is a one-to-one mapping between
 493    gimple arguments and child nodes.  If that is not true for STMT,
 494    return an array that contains:
 495
 496    - the number of child nodes, followed by
 497    - for each child node, the index of the argument associated with that node.
 498      The special index -1 is the first operand of an embedded comparison and
 499      the special index -2 is the second operand of an embedded comparison.
 500
 501    SWAP is as for vect_get_and_check_slp_defs.  */
 502
 503 static const int *
 504 vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
 505 {
 506   if (auto assign = dyn_cast<const gassign *> (stmt))
 507     {
 508       if (gimple_assign_rhs_code (assign) == COND_EXPR
 509           && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
 510         return cond_expr_maps[swap];
 511       if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
 512           && swap)
 513         return op1_op0_map;
 514     }
 515   gcc_assert (!swap);
 516   if (auto call = dyn_cast<const gcall *> (stmt))
 517     {
 518       if (gimple_call_internal_p (call))
 519         switch (gimple_call_internal_fn (call))
 520           {
 521           case IFN_MASK_LOAD:
 522             return arg2_map;
 523
 524           case IFN_GATHER_LOAD:
 525             return arg1_map;
 526
 527           case IFN_MASK_GATHER_LOAD:
 528             return arg1_arg4_map;
 529
 530           default:
 531             break;
 532           }
 533     }
 534   return nullptr;
 535 }
 536
 537 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
 538    they are of a valid type and that they match the defs of the first stmt of
 539    the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
 540    by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
 541    indicates swap is required for cond_expr stmts.  Specifically, SWAP
 542    is 1 if STMT is cond and operands of comparison need to be swapped;
 543    SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
 544
 545    If there was a fatal error return -1; if the error could be corrected by
 546    swapping operands of father node of this one, return 1; if everything is
 547    ok return 0.  */
  548 static int
  549 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
  550                              bool *skip_args,
  551                              vec<stmt_vec_info> stmts, unsigned stmt_num,
  552                              vec<slp_oprnd_info> *oprnds_info)
  553 {
  554   stmt_vec_info stmt_info = stmts[stmt_num];
  555   tree oprnd;
  556   unsigned int i, number_of_oprnds;
  557   enum vect_def_type dt = vect_uninitialized_def;
  558   slp_oprnd_info oprnd_info;
        /* Index of the first of two adjacent operands that may be exchanged;
           -1U when the stmt is not known to be commutative.  */
  559   unsigned int commutative_op = -1U;
        /* The first stmt of the group seeds OPRNDS_INFO; later stmts are
           matched against what it recorded.  */
  560   bool first = stmt_num == 0;
  561
        /* Only calls, assignments and PHIs are handled.  */
  562   if (!is_a<gcall *> (stmt_info->stmt)
  563       && !is_a<gassign *> (stmt_info->stmt)
  564       && !is_a<gphi *> (stmt_info->stmt))
  565     return -1;
  566
        /* By default every gimple argument is an operand.  An operand map, if
           there is one, overrides the count; MAP then points at the argument
           index to use for each operand.  */
  567   number_of_oprnds = gimple_num_args (stmt_info->stmt);
  568   const int *map = vect_get_operand_map (stmt_info->stmt, swap);
  569   if (map)
  570     number_of_oprnds = *map++;
  571   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
  572     {
  573       if (gimple_call_internal_p (stmt))
  574         {
  575           internal_fn ifn = gimple_call_internal_fn (stmt);
  576           commutative_op = first_commutative_argument (ifn);
  577         }
  578     }
  579   else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
  580     {
  581       if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
  582         commutative_op = 0;
  583     }
  584
        /* A caller-requested SWAP counts as already swapped, which disables
           the commutative swap attempt in the matching loop below.  */
  585   bool swapped = (swap != 0);
  586   bool backedge = false;
  587   enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
        /* First pass: record the operand, its def-stmt and its def-type for
           every operand of this stmt.  */
  588   for (i = 0; i < number_of_oprnds; i++)
  589     {
  590       int opno = map ? map[i] : int (i);
            /* Negative map entries select an operand of the tree in gimple
               argument 0: -1 is its operand 0 and -2 is its operand 1.  */
  591       if (opno < 0)
  592         oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
  593       else
  594         {
  595           oprnd = gimple_arg (stmt_info->stmt, opno);
                /* For a PHI note whether the source block of this argument's
                   edge is dominated by the PHI's own block.  */
  596           if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
  597             backedge = dominated_by_p (CDI_DOMINATORS,
  598                                        gimple_phi_arg_edge (stmt, opno)->src,
  599                                        gimple_bb (stmt_info->stmt));
  600         }
            /* Look through a VIEW_CONVERT_EXPR wrapper.  */
  601       if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
  602         oprnd = TREE_OPERAND (oprnd, 0);
  603
  604       oprnd_info = (*oprnds_info)[i];
  605
  606       stmt_vec_info def_stmt_info;
  607       if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
  608         {
  609           if (dump_enabled_p ())
  610             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  611                              "Build SLP failed: can't analyze def for %T\n",
  612                              oprnd);
  613
  614           return -1;
  615         }
  616
            /* Skipped operands still get a placeholder entry so that the
               entries of def_stmts and ops stay indexed by stmt number.  */
  617       if (skip_args[i])
  618         {
  619           oprnd_info->def_stmts.quick_push (NULL);
  620           oprnd_info->ops.quick_push (NULL_TREE);
  621           oprnd_info->first_dt = vect_uninitialized_def;
  622           continue;
  623         }
  624
  625       oprnd_info->def_stmts.quick_push (def_stmt_info);
  626       oprnd_info->ops.quick_push (oprnd);
  627
  628       if (def_stmt_info
  629           && is_pattern_stmt_p (def_stmt_info))
  630         {
  631           if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
  632               != def_stmt_info)
  633             oprnd_info->any_pattern = true;
  634           else
  635             /* If we promote this to external use the original stmt def.  */
  636             oprnd_info->ops.last ()
  637               = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
  638         }
  639
  640       /* If there's a extern def on a backedge make sure we can
  641          code-generate at the region start.
  642          ???  This is another case that could be fixed by adjusting
  643          how we split the function but at the moment we'd have conflicting
  644          goals there.  */
  645       if (backedge
  646           && dts[i] == vect_external_def
  647           && is_a <bb_vec_info> (vinfo)
  648           && TREE_CODE (oprnd) == SSA_NAME
  649           && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
  650           && !dominated_by_p (CDI_DOMINATORS,
  651                               as_a <bb_vec_info> (vinfo)->bbs[0],
  652                               gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
  653         {
  654           if (dump_enabled_p ())
  655             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  656                              "Build SLP failed: extern def %T only defined "
  657                              "on backedge\n", oprnd);
  658           return -1;
  659         }
  660
            /* Only the first stmt of the group validates the def-type and
               records it, together with the operand type, as the reference
               for the remaining stmts.  */
  661       if (first)
  662         {
  663           tree type = TREE_TYPE (oprnd);
  664           dt = dts[i];
  665           if ((dt == vect_constant_def
  666                || dt == vect_external_def)
  667               && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
  668               && (TREE_CODE (type) == BOOLEAN_TYPE
  669                   || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
  670                                                       type)))
  671             {
  672               if (dump_enabled_p ())
  673                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  674                                  "Build SLP failed: invalid type of def "
  675                                  "for variable-length SLP %T\n", oprnd);
  676               return -1;
  677             }
  678
  679           /* For the swapping logic below force vect_reduction_def
  680              for the reduction op in a SLP reduction group.  */
  681           if (!STMT_VINFO_DATA_REF (stmt_info)
  682               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
  683               && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
  684               && def_stmt_info)
  685             dts[i] = dt = vect_reduction_def;
  686
  687           /* Check the types of the definition.  */
  688           switch (dt)
  689             {
  690             case vect_external_def:
  691             case vect_constant_def:
  692             case vect_internal_def:
  693             case vect_reduction_def:
  694             case vect_induction_def:
  695             case vect_nested_cycle:
  696             case vect_first_order_recurrence:
  697               break;
  698
  699             default:
  700               /* FORNOW: Not supported.  */
  701               if (dump_enabled_p ())
  702                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  703                                  "Build SLP failed: illegal type of def %T\n",
  704                                  oprnd);
  705               return -1;
  706             }
  707
  708           oprnd_info->first_dt = dt;
  709           oprnd_info->first_op_type = type;
  710         }
  711     }
        /* The first stmt has nothing to be matched against.  */
  712   if (first)
  713     return 0;
  714
  715   /* Now match the operand definition types to that of the first stmt.  */
        /* I is advanced explicitly: after exchanging operands I and I + 1 the
           loop re-examines operand I without incrementing.  */
  716   for (i = 0; i < number_of_oprnds;)
  717     {
  718       if (skip_args[i])
  719         {
  720           ++i;
  721           continue;
  722         }
  723
  724       oprnd_info = (*oprnds_info)[i];
  725       dt = dts[i];
  726       stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
  727       oprnd = oprnd_info->ops[stmt_num];
  728       tree type = TREE_TYPE (oprnd);
  729
  730       if (!types_compatible_p (oprnd_info->first_op_type, type))
  731         {
  732           if (dump_enabled_p ())
  733             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  734                              "Build SLP failed: different operand types\n");
  735           return 1;
  736         }
  737
  738       /* Not first stmt of the group, check that the def-stmt/s match
  739          the def-stmt/s of the first stmt.  Allow different definition
  740          types for reduction chains: the first stmt must be a
  741          vect_reduction_def (a phi node), and the rest
  742          end in the reduction chain.  */
  743       if ((!vect_def_types_match (oprnd_info->first_dt, dt)
  744            && !(oprnd_info->first_dt == vect_reduction_def
  745                 && !STMT_VINFO_DATA_REF (stmt_info)
  746                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
  747                 && def_stmt_info
  748                 && !STMT_VINFO_DATA_REF (def_stmt_info)
  749                 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
  750                     == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
  751           || (!STMT_VINFO_DATA_REF (stmt_info)
  752               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
  753               && ((!def_stmt_info
  754                    || STMT_VINFO_DATA_REF (def_stmt_info)
  755                    || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
  756                        != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
  757                   != (oprnd_info->first_dt != vect_reduction_def))))
  758         {
  759           /* Try swapping operands if we got a mismatch.  For BB
  760              vectorization only in case it will clearly improve things.  */
  761           if (i == commutative_op && !swapped
  762               && (!is_a <bb_vec_info> (vinfo)
  763                   || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
  764                                              dts[i+1])
  765                       && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
  766                           || vect_def_types_match
  767                                ((*oprnds_info)[i+1]->first_dt, dts[i])))))
  768             {
  769               if (dump_enabled_p ())
  770                 dump_printf_loc (MSG_NOTE, vect_location,
  771                                  "trying swapped operands\n");
                    /* Exchange the def-types and the entries recorded for this
                       stmt in operand infos I and I + 1, then retry operand I.
                       Setting SWAPPED allows at most one such attempt.  */
  772               std::swap (dts[i], dts[i+1]);
  773               std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
  774                          (*oprnds_info)[i+1]->def_stmts[stmt_num]);
  775               std::swap ((*oprnds_info)[i]->ops[stmt_num],
  776                          (*oprnds_info)[i+1]->ops[stmt_num]);
  777               swapped = true;
  778               continue;
  779             }
  780
  781           if (is_a <bb_vec_info> (vinfo)
  782               && !oprnd_info->any_pattern)
  783             {
  784               /* Now for commutative ops we should see whether we can
  785                  make the other operand matching.  */
  786               if (dump_enabled_p ())
  787                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  788                                  "treating operand as external\n");
  789               oprnd_info->first_dt = dt = vect_external_def;
  790             }
  791           else
  792             {
  793               if (dump_enabled_p ())
  794                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  795                                  "Build SLP failed: different types\n");
  796               return 1;
  797             }
  798         }
  799
  800       /* Make sure to demote the overall operand to external.  */
  801       if (dt == vect_external_def)
  802         oprnd_info->first_dt = vect_external_def;
  803       /* For a SLP reduction chain we want to duplicate the reduction to
  804          each of the chain members.  That gets us a sane SLP graph (still
  805          the stmts are not 100% correct wrt the initial values).  */
  806       else if ((dt == vect_internal_def
  807                 || dt == vect_reduction_def)
  808                && oprnd_info->first_dt == vect_reduction_def
  809                && !STMT_VINFO_DATA_REF (stmt_info)
  810                && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
  811                && !STMT_VINFO_DATA_REF (def_stmt_info)
  812                && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
  813                    == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
  814         {
  815           oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
  816           oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
  817         }
  818
  819       ++i;
  820     }
  821
  822   /* Swap operands.  */
        /* The recorded operand info was already exchanged in the loop above
           (or SWAP was requested by the caller); only a dump note is emitted
           here.  */
  823   if (swapped)
  824     {
  825       if (dump_enabled_p ())
  826         dump_printf_loc (MSG_NOTE, vect_location,
  827                          "swapped operands to match def types in %G",
  828                          stmt_info->stmt);
  829     }
  830
  831   return 0;
  832 }
 833
 834 /* Return true if call statements CALL1 and CALL2 are alike enough for
 835    both of them to be placed in one SLP group.  */
 836
 837 bool
 838 compatible_calls_p (gcall *call1, gcall *call2)
 839 {
 840   const unsigned int num_args = gimple_call_num_args (call1);
 841   if (gimple_call_num_args (call2) != num_args
 842       || (gimple_call_combined_fn (call1)
 843           != gimple_call_combined_fn (call2)))
 844     return false;
 845
 846   if (!gimple_call_internal_p (call1))
 847     {
 848       /* Ordinary calls: same callee and same function type.  */
 849       if (!operand_equal_p (gimple_call_fn (call1),
 850                             gimple_call_fn (call2), 0)
 851           || gimple_call_fntype (call1) != gimple_call_fntype (call2))
 852         return false;
 853     }
 854   else
 855     {
 856       /* Internal functions: compatible result and argument types.  */
 857       if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
 858                                TREE_TYPE (gimple_call_lhs (call2))))
 859         return false;
 860       for (unsigned int argno = 0; argno < num_args; ++argno)
 861         {
 862           tree type1 = TREE_TYPE (gimple_call_arg (call1, argno));
 863           tree type2 = TREE_TYPE (gimple_call_arg (call2, argno));
 864           if (!types_compatible_p (type1, type2))
 865             return false;
 866         }
 867     }
 868
 869   /* Unvectorized arguments (those the operand map omits) must be equal.  */
 870   const int *map = vect_get_operand_map (call1);
 871   if (!map)
 872     return true;
 873   const unsigned int nkept = *map++;
 874   for (unsigned int argno = 0, next = 0; argno < num_args; ++argno)
 875     if (next < nkept && map[next] == int (argno))
 876       ++next;
 877     else if (!operand_equal_p (gimple_call_arg (call1, argno),
 878                                gimple_call_arg (call2, argno)))
 879       return false;
 880   return true;
 881 }
 882
 883 /* Helper of vect_build_slp_tree that checks VECTYPE, the caller's guess
 884    at the vector type in STMT_INFO that has the narrowest element type.
 885    The result is true if VECTYPE is nonnull and valid for STMT_INFO, and
 886    in that case *MAX_NUNITS is updated to reflect the number of units in
 887    VECTYPE.  GROUP_SIZE and MAX_NUNITS have the same meaning as for
 888    vect_build_slp_tree.  */
 889
 890 static bool
 891 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
 892                         unsigned int group_size,
 893                         tree vectype, poly_uint64 *max_nunits)
 894 {
 895   if (vectype)
 896     {
 897       /* For basic-block vectorization fail before adjusting *MAX_NUNITS
 898          when populating the vector type would require unrolling.  */
 899       const bool needs_unroll
 900         = (is_a <bb_vec_info> (vinfo)
 901            && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)));
 902       if (!needs_unroll)
 903         {
 904           /* In case of multiple types the smallest type is detected.  */
 905           vect_update_max_nunits (max_nunits, vectype);
 906           return true;
 907         }
 908       if (dump_enabled_p ())
 909         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 910                          "Build SLP failed: unrolling required "
 911                          "in basic block SLP\n");
 912       /* Fatal mismatch.  */
 913       return false;
 914     }
 915   if (dump_enabled_p ())
 916     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 917                      "Build SLP failed: unsupported data-type in %G\n",
 918                      stmt_info->stmt);
 919   /* Fatal mismatch.  */
 920   return false;
 921 }
 922
 923 /* Verify if the scalar stmts STMTS are isomorphic, require data
 924    permutation or are of unsupported types of operation.  Return true if
 925    they are, otherwise return false and indicate in *MATCHES which stmts
 926    are not isomorphic to the first one.  If MATCHES[0] is false then this
 927    indicates the comparison could not be carried out or the stmts will
 928    never be vectorized by SLP.  *NODE_VECTYPE is set to the vector type of
 929    the first stmt and *TWO_OPERATORS to true when a second operation code
 930    that is neither a reference nor a comparison was accepted.  SWAP[i] is
 931    0 unless stmt I is a COND_EXPR that is isomorphic to the first stmt
 932    after swapping the two operands of its comparison (1) or inverting the
 933    code of the comparison (2); it is also 1 for a comparison stmt whose
 934    code is the swapped code of the first stmt.  Take A1 >= B1 ? X1 : Y1 as
 935    an example: swapped it is B1 <= A1 ? X1 : Y1, inverted A1 < B1 ? Y1 : X1.  */
 936
 937 static bool
 938 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
 939                        vec<stmt_vec_info> stmts, unsigned int group_size,
 940                        poly_uint64 *max_nunits, bool *matches,
 941                        bool *two_operators, tree *node_vectype)
 942 {
 943   unsigned int i;
 944   stmt_vec_info first_stmt_info = stmts[0];
 945   code_helper first_stmt_code = ERROR_MARK;
 946   code_helper alt_stmt_code = ERROR_MARK;
 947   code_helper rhs_code = ERROR_MARK;
 948   code_helper first_cond_code = ERROR_MARK;
 949   tree lhs;
 950   bool need_same_oprnds = false;
 951   tree vectype = NULL_TREE, first_op1 = NULL_TREE;
 952   stmt_vec_info first_load = NULL, prev_first_load = NULL;
 953   bool first_stmt_load_p = false, load_p = false;
 954   bool first_stmt_phi_p = false, phi_p = false;
 955   bool maybe_soft_fail = false;
 956   tree soft_fail_nunits_vectype = NULL_TREE;
 957
 958   /* Check every stmt in STMTS against the first one.  */
 959   stmt_vec_info stmt_info;
 960   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
 961     {
 962       gimple *stmt = stmt_info->stmt;
 963       swap[i] = 0;
 964       matches[i] = false;
 965
 966       if (dump_enabled_p ())
 967         dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
 968
 969       /* Fail to vectorize statements marked as unvectorizable, throw
 970          or are volatile.  */
 971       if (!STMT_VINFO_VECTORIZABLE (stmt_info)
 972           || stmt_can_throw_internal (cfun, stmt)
 973           || gimple_has_volatile_ops (stmt))
 974         {
 975           if (dump_enabled_p ())
 976             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 977                              "Build SLP failed: unvectorizable statement %G",
 978                              stmt);
 979           /* ???  For BB vectorization we want to commutate operands in a way
 980              to shuffle all unvectorizable defs into one operand and have
 981              the other still vectorized.  The following doesn't reliably
 982              work for this though but it's the easiest we can do here.  */
 983           if (is_a <bb_vec_info> (vinfo) && i != 0)
 984             continue;
 985           /* Fatal mismatch.  */
 986           matches[0] = false;
 987           return false;
 988         }
 989
 990       lhs = gimple_get_lhs (stmt);
 991       if (lhs == NULL_TREE)
 992         {
 993           if (dump_enabled_p ())
 994             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 995                              "Build SLP failed: not GIMPLE_ASSIGN nor "
 996                              "GIMPLE_CALL %G", stmt);
 997           if (is_a <bb_vec_info> (vinfo) && i != 0)
 998             continue;
 999           /* Fatal mismatch.  */
1000           matches[0] = false;
1001           return false;
1002         }
1003
1004       tree nunits_vectype;
1005       if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1006                                            &nunits_vectype, group_size))
1007         {
1008           if (is_a <bb_vec_info> (vinfo) && i != 0)
1009             continue;
1010           /* Fatal mismatch.  */
1011           matches[0] = false;
1012           return false;
1013         }
1014       /* Record nunits required but continue analysis, producing matches[]
1015          as if nunits was not an issue.  This allows splitting of groups
1016          to happen.  */
1017       if (nunits_vectype
1018           && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1019                                       nunits_vectype, max_nunits))
1020         {
1021           gcc_assert (is_a <bb_vec_info> (vinfo));
1022           maybe_soft_fail = true;
1023           soft_fail_nunits_vectype = nunits_vectype;
1024         }
1025
1026       gcc_assert (vectype);
1027
1028       gcall *call_stmt = dyn_cast <gcall *> (stmt);
1029       if (call_stmt)
1030         {
1031           combined_fn cfn = gimple_call_combined_fn (call_stmt);
1032           if (cfn != CFN_LAST)
1033             rhs_code = cfn;
1034           else
1035             rhs_code = CALL_EXPR;
1036
1037           if (cfn == CFN_MASK_LOAD
1038               || cfn == CFN_GATHER_LOAD
1039               || cfn == CFN_MASK_GATHER_LOAD)
1040             load_p = true;
1041           else if ((internal_fn_p (cfn)
1042                     && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1043                    || gimple_call_tail_p (call_stmt)
1044                    || gimple_call_noreturn_p (call_stmt)
1045                    || gimple_call_chain (call_stmt))
1046             {
1047               if (dump_enabled_p ())
1048                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1049                                  "Build SLP failed: unsupported call type %G",
1050                                  (gimple *) call_stmt);
1051               if (is_a <bb_vec_info> (vinfo) && i != 0)
1052                 continue;
1053               /* Fatal mismatch.  */
1054               matches[0] = false;
1055               return false;
1056             }
1057         }
1058       else if (gimple_code (stmt) == GIMPLE_PHI)
1059         {
1060           rhs_code = ERROR_MARK;
1061           phi_p = true;
1062         }
1063       else
1064         {
1065           rhs_code = gimple_assign_rhs_code (stmt);
1066           load_p = gimple_vuse (stmt);
1067         }
1068
1069       /* Check the operation.  */
1070       if (i == 0)
1071         {
1072           *node_vectype = vectype;
1073           first_stmt_code = rhs_code;
1074           first_stmt_load_p = load_p;
1075           first_stmt_phi_p = phi_p;
1076
1077           /* Shift arguments should be equal in all the packed stmts for a
1078              vector shift with scalar shift operand.  */
1079           if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1080               || rhs_code == LROTATE_EXPR
1081               || rhs_code == RROTATE_EXPR)
1082             {
1083               /* First see if we have a vector/vector shift.  */
1084               if (!directly_supported_p (rhs_code, vectype, optab_vector))
1085                 {
1086                   /* No vector/vector shift, try for a vector/scalar shift.  */
1087                   if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1088                     {
1089                       if (dump_enabled_p ())
1090                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1091                                          "Build SLP failed: "
1092                                          "op not supported by target.\n");
1093                       if (is_a <bb_vec_info> (vinfo) && i != 0)
1094                         continue;
1095                       /* Fatal mismatch.  */
1096                       matches[0] = false;
1097                       return false;
1098                     }
1099                   need_same_oprnds = true;
1100                   first_op1 = gimple_assign_rhs2 (stmt);
1101                 }
1102             }
1103           else if (rhs_code == WIDEN_LSHIFT_EXPR)
1104             {
1105               need_same_oprnds = true;
1106               first_op1 = gimple_assign_rhs2 (stmt);
1107             }
1108           else if (!load_p
1109                    && rhs_code == BIT_FIELD_REF)
1110             {
1111               tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1112               if (!is_a <bb_vec_info> (vinfo)
1113                   || TREE_CODE (vec) != SSA_NAME
1114                   /* When the element types are not compatible we pun the
1115                      source to the target vectype which requires equal size.  */
1116                   || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1117                        || !types_compatible_p (TREE_TYPE (vectype),
1118                                                TREE_TYPE (TREE_TYPE (vec))))
1119                       && !operand_equal_p (TYPE_SIZE (vectype),
1120                                            TYPE_SIZE (TREE_TYPE (vec)))))
1121                 {
1122                   if (dump_enabled_p ())
1123                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1124                                      "Build SLP failed: "
1125                                      "BIT_FIELD_REF not supported\n");
1126                   /* Fatal mismatch.  */
1127                   matches[0] = false;
1128                   return false;
1129                 }
1130             }
1131           else if (rhs_code == CFN_DIV_POW2)
1132             {
1133               need_same_oprnds = true;
1134               first_op1 = gimple_call_arg (call_stmt, 1);
1135             }
1136         }
1137       else
1138         {
1139           if (first_stmt_code != rhs_code
1140               && alt_stmt_code == ERROR_MARK)
1141             alt_stmt_code = rhs_code;
1142           if ((first_stmt_code != rhs_code
1143                && (first_stmt_code != IMAGPART_EXPR
1144                    || rhs_code != REALPART_EXPR)
1145                && (first_stmt_code != REALPART_EXPR
1146                    || rhs_code != IMAGPART_EXPR)
1147                /* Handle mismatches in plus/minus by computing both
1148                   and merging the results.  */
1149                && !((first_stmt_code == PLUS_EXPR
1150                      || first_stmt_code == MINUS_EXPR)
1151                     && (alt_stmt_code == PLUS_EXPR
1152                         || alt_stmt_code == MINUS_EXPR)
1153                     && rhs_code == alt_stmt_code)
1154                && !(first_stmt_code.is_tree_code ()
1155                     && rhs_code.is_tree_code ()
1156                     && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1157                         == tcc_comparison)
1158                     && (swap_tree_comparison (tree_code (first_stmt_code))
1159                         == tree_code (rhs_code)))
1160                && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1161                     && (first_stmt_code == ARRAY_REF
1162                         || first_stmt_code == BIT_FIELD_REF
1163                         || first_stmt_code == INDIRECT_REF
1164                         || first_stmt_code == COMPONENT_REF
1165                         || first_stmt_code == MEM_REF)
1166                     && (rhs_code == ARRAY_REF
1167                         || rhs_code == BIT_FIELD_REF
1168                         || rhs_code == INDIRECT_REF
1169                         || rhs_code == COMPONENT_REF
1170                         || rhs_code == MEM_REF)))
1171               || first_stmt_load_p != load_p
1172               || first_stmt_phi_p != phi_p)
1173             {
1174               if (dump_enabled_p ())
1175                 {
1176                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1177                                    "Build SLP failed: different operation "
1178                                    "in stmt %G", stmt);
1179                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1180                                    "original stmt %G", first_stmt_info->stmt);
1181                 }
1182               /* Mismatch.  */
1183               continue;
1184             }
1185
1186           if (!load_p
1187               && first_stmt_code == BIT_FIELD_REF
1188               && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1189                   != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1190             {
1191               if (dump_enabled_p ())
1192                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1193                                  "Build SLP failed: different BIT_FIELD_REF "
1194                                  "arguments in %G", stmt);
1195               /* Mismatch.  */
1196               continue;
1197             }
1198
1199           if (call_stmt && first_stmt_code != CFN_MASK_LOAD)
1200             {
1201               if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1202                                        call_stmt))
1203                 {
1204                   if (dump_enabled_p ())
1205                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1206                                      "Build SLP failed: different calls in %G",
1207                                      stmt);
1208                   /* Mismatch.  */
1209                   continue;
1210                 }
1211             }
1212
1213           if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1214               && (gimple_bb (first_stmt_info->stmt)
1215                   != gimple_bb (stmt_info->stmt)))
1216             {
1217               if (dump_enabled_p ())
1218                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1219                                  "Build SLP failed: different BB for PHI "
1220                                  "or possibly trapping operation in %G", stmt);
1221               /* Mismatch.  */
1222               continue;
1223             }
1224
1225           if (need_same_oprnds)
1226             {
1227               tree other_op1 = gimple_arg (stmt, 1);
1228               if (!operand_equal_p (first_op1, other_op1, 0))
1229                 {
1230                   if (dump_enabled_p ())
1231                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1232                                      "Build SLP failed: different shift "
1233                                      "arguments in %G", stmt);
1234                   /* Mismatch.  */
1235                   continue;
1236                 }
1237             }
1238
1239           if (!types_compatible_p (vectype, *node_vectype))
1240             {
1241               if (dump_enabled_p ())
1242                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1243                                  "Build SLP failed: different vector type "
1244                                  "in %G", stmt);
1245               /* Mismatch.  */
1246               continue;
1247             }
1248         }
1249
1250       /* Grouped store or load.  */
1251       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1252         {
1253           if (REFERENCE_CLASS_P (lhs))
1254             {
1255               /* Store.  */
1256               ;
1257             }
1258           else
1259             {
1260               /* Load.  */
1261               first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1262               if (prev_first_load)
1263                 {
1264                   /* Check that there are no loads from different interleaving
1265                      chains in the same node.  */
1266                   if (prev_first_load != first_load)
1267                     {
1268                       if (dump_enabled_p ())
1269                         dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1270                                          vect_location,
1271                                          "Build SLP failed: different "
1272                                          "interleaving chains in one node %G",
1273                                          stmt);
1274                       /* Mismatch.  */
1275                       continue;
1276                     }
1277                 }
1278               else
1279                 prev_first_load = first_load;
1280            }
1281         } /* Grouped access.  */
1282       else
1283         {
1284           if (load_p
1285               && rhs_code != CFN_GATHER_LOAD
1286               && rhs_code != CFN_MASK_GATHER_LOAD)
1287             {
1288               /* Not grouped load.  */
1289               if (dump_enabled_p ())
1290                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1291                                  "Build SLP failed: not grouped load %G", stmt);
1292
1293               /* FORNOW: Not grouped loads are not supported.  */
1294               if (is_a <bb_vec_info> (vinfo) && i != 0)
1295                 continue;
1296               /* Fatal mismatch.  */
1297               matches[0] = false;
1298               return false;
1299             }
1300
1301           /* Not memory operation.  */
1302           if (!phi_p
1303               && rhs_code.is_tree_code ()
1304               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1305               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1306               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1307               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1308               && rhs_code != VIEW_CONVERT_EXPR
1309               && rhs_code != CALL_EXPR
1310               && rhs_code != BIT_FIELD_REF)
1311             {
1312               if (dump_enabled_p ())
1313                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1314                                  "Build SLP failed: operation unsupported %G",
1315                                  stmt);
1316               if (is_a <bb_vec_info> (vinfo) && i != 0)
1317                 continue;
1318               /* Fatal mismatch.  */
1319               matches[0] = false;
1320               return false;
1321             }
1322
1323           if (rhs_code == COND_EXPR)
1324             {
1325               tree cond_expr = gimple_assign_rhs1 (stmt);
1326               enum tree_code cond_code = TREE_CODE (cond_expr);
1327               enum tree_code swap_code = ERROR_MARK;
1328               enum tree_code invert_code = ERROR_MARK;
1329
1330               if (i == 0)
1331                 first_cond_code = TREE_CODE (cond_expr);
1332               else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1333                 {
1334                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1335                   swap_code = swap_tree_comparison (cond_code);
1336                   invert_code = invert_tree_comparison (cond_code, honor_nans);
1337                 }
1338
1339               if (first_cond_code == cond_code)
1340                 ;
1341               /* Isomorphic can be achieved by swapping.  */
1342               else if (first_cond_code == swap_code)
1343                 swap[i] = 1;
1344               /* Isomorphic can be achieved by inverting.  */
1345               else if (first_cond_code == invert_code)
1346                 swap[i] = 2;
1347               else
1348                 {
1349                   if (dump_enabled_p ())
1350                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1351                                      "Build SLP failed: different"
1352                                      " operation %G", stmt);
1353                   /* Mismatch.  */
1354                   continue;
1355                 }
1356             }
1357
1358           if (rhs_code.is_tree_code ()
1359               && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1360               && (swap_tree_comparison ((tree_code)first_stmt_code)
1361                   == (tree_code)rhs_code))
1362             swap[i] = 1;
1363         }
1364
1365       matches[i] = true;
1366     }
1367
1368   for (i = 0; i < group_size; ++i)
1369     if (!matches[i])
1370       return false;
1371
1372   /* If we allowed a two-operation SLP node record that in *TWO_OPERATORS;
1373      no target support check for the resulting permute is done here.  */
1374   if (alt_stmt_code != ERROR_MARK
1375       && (!alt_stmt_code.is_tree_code ()
1376           || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1377               && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1378     {
1379       *two_operators = true;
1380     }
1381
1382   if (maybe_soft_fail)
1383     {
1384       unsigned HOST_WIDE_INT const_nunits;
1385       if (!TYPE_VECTOR_SUBPARTS
1386             (soft_fail_nunits_vectype).is_constant (&const_nunits)
1387           || const_nunits > group_size)
1388         matches[0] = false;
1389       else
1390         {
1391           /* With constant vector elements simulate a mismatch at the
1392              point we need to split.  */
1393           unsigned tail = group_size & (const_nunits - 1);
1394           memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1395         }
1396       return false;
1397     }
1398
1399   return true;
1400 }
1401
1402 /* Hash traits keying a table by a set of scalar stmts (see
1403    scalar_stmts_to_slp_tree_map_t).  Nothing is removed before the table
1404    is destroyed, so the deleted marker may coincide with the empty one.  */
1405 struct bst_traits
1406 {
1407   using value_type = vec <stmt_vec_info>;
1408   using compare_type = value_type;
1409   static const bool empty_zero_p = true;
1410   static inline hashval_t hash (value_type);
1411   static inline bool equal (value_type existing, value_type candidate);
1412   static inline bool is_empty (value_type x) { return !x.exists (); }
1413   static inline bool is_deleted (value_type x) { return !x.exists (); }
1414   static inline void mark_empty (value_type &x) { x.release (); }
1415   static inline void mark_deleted (value_type &x) { x.release (); }
1416   static inline void remove (value_type &x) { x.release (); }
1417 };
1418 inline hashval_t
1419 bst_traits::hash (value_type x)
1420 {
1421   inchash::hash hstate;  /* Fed the gimple UID of each stmt of X in order.  */
1422   for (unsigned idx = 0, len = x.length (); idx < len; ++idx)
1423     hstate.add_int (gimple_uid (x[idx]->stmt));
1424   return hstate.end ();
1425 }
1426 inline bool
1427 bst_traits::equal (value_type existing, value_type candidate)
1428 {
1429   /* Equal stmt sets have equal length and pairwise identical stmts.  */
1430   const unsigned len = existing.length ();
1431   bool same_p = len == candidate.length ();
1432   for (unsigned idx = 0; same_p && idx < len; ++idx)
1433     same_p = existing[idx] == candidate[idx];
1434   return same_p;
1435 }
1436
1437 /* Element of a linearized operation chain.  ???  This used to be
1438    std::pair<std::pair<tree_code, vect_def_type>, tree> but vec::insert
1439    does memmove, which is not compatible with std::pair.  */
1440 struct chain_op_t
1441 {
1442   tree_code code;
1443   vect_def_type dt;
1444   tree op;
1445   chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1446     : code (code_), dt (dt_), op (op_) {}
1447 };
1448
1449 /* Comparator for sorting the operands of associatable chains: orders
1450    by def type first and by operation code second.  */
1451
1452 static int
1453 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1454 {
1455   const chain_op_t *lhs = static_cast <const chain_op_t *> (op1_);
1456   const chain_op_t *rhs = static_cast <const chain_op_t *> (op2_);
1457   const int dt_diff = (int) lhs->dt - (int) rhs->dt;
1458   return dt_diff != 0 ? dt_diff : (int) lhs->code - (int) rhs->code;
1459 }
1460
1461 /* Flatten the expression chain rooted at START whose stmts use the
1462    associatable operation CODE (PLUS_EXPR chains may contain MINUS_EXPR).
1463    Leaf operands are appended to CHAIN together with the operation that
1464    applies to them; WORKLIST is scratch storage.  CODE_STMT and
1465    ALT_CODE_STMT, if unset, receive the first visited stmt using CODE resp.
1466    MINUS_EXPR.  Visited stmts are appended to *CHAIN_STMTS if not NULL.  */
1467
1468 static void
1469 vect_slp_linearize_chain (vec_info *vinfo,
1470                           vec<std::pair<tree_code, gimple *> > &worklist,
1471                           vec<chain_op_t> &chain,
1472                           enum tree_code code, gimple *start,
1473                           gimple *&code_stmt, gimple *&alt_code_stmt,
1474                           vec<gimple *> *chain_stmts)
1475 {
1476   /* Walk the addition/subtraction (or other uniform associatable
1477      operation) expression tree of this lane with an explicit stack.  */
1478   worklist.safe_push (std::make_pair (code, start));
1479   while (!worklist.is_empty ())
1480     {
1481       const std::pair<tree_code, gimple *> item = worklist.pop ();
1482       const enum tree_code in_code = item.first;
1483       gassign *stmt = as_a <gassign *> (item.second);
1484       const enum tree_code this_code = gimple_assign_rhs_code (stmt);
1485       /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
1486       if (!code_stmt && this_code == code)
1487         code_stmt = stmt;
1488       else if (!alt_code_stmt && this_code == MINUS_EXPR)
1489         alt_code_stmt = stmt;
1490       if (chain_stmts)
1491         chain_stmts->safe_push (stmt);
1492
1493       for (unsigned opnum = 1; opnum <= 2; ++opnum)
1494         {
1495           /* Operation this operand contributes with: the first operand
1496              of a MINUS_EXPR is added, and the operation is flipped when
1497              the stmt's own result enters the chain as MINUS_EXPR.  */
1498           tree_code op_def_code = this_code;
1499           if (op_def_code == MINUS_EXPR && opnum == 1)
1500             op_def_code = PLUS_EXPR;
1501           if (in_code == MINUS_EXPR)
1502             op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1503           tree op = gimple_op (stmt, opnum);
1504           vect_def_type dt;
1505           stmt_vec_info def_stmt_info;
1506           bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1507           gcc_assert (res);
1508           const bool internal_p = dt == vect_internal_def;
1509           if (internal_p && is_pattern_stmt_p (def_stmt_info))
1510             op = gimple_get_lhs (def_stmt_info->stmt);
1511           /* A single-use internal def computed by CODE (or by MINUS_EXPR
1512              in a PLUS_EXPR chain) is expanded further, anything else
1513              becomes a leaf of CHAIN.  */
1514           gimple *use_stmt;
1515           use_operand_p use_p;
1516           bool expand_p = false;
1517           if (internal_p
1518               && single_imm_use (op, &use_p, &use_stmt)
1519               && is_gimple_assign (def_stmt_info->stmt))
1520             {
1521               tree_code def_code
1522                 = gimple_assign_rhs_code (def_stmt_info->stmt);
1523               expand_p = (def_code == code
1524                           || (code == PLUS_EXPR && def_code == MINUS_EXPR));
1525             }
1526           if (expand_p)
1527             worklist.safe_push (std::make_pair (op_def_code,
1528                                                 def_stmt_info->stmt));
1529           else
1530             chain.safe_push (chain_op_t (op_def_code, dt, op));
1531         }
1532     }
1533 }
1534
1535 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1536                   simple_hashmap_traits <bst_traits, slp_tree> >
1537   scalar_stmts_to_slp_tree_map_t;
1538
1539 static slp_tree
1540 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1541                        vec<stmt_vec_info> stmts, unsigned int group_size,
1542                        poly_uint64 *max_nunits,
1543                        bool *matches, unsigned *limit, unsigned *tree_size,
1544                        scalar_stmts_to_slp_tree_map_t *bst_map);
1545
/* Build, or re-use from BST_MAP, the SLP node for the GROUP_SIZE scalar
   stmts STMTS.

   If BST_MAP already has an entry for STMTS that entry is used: a
   successfully built node gets its reference count bumped, *MAX_NUNITS
   is updated from it, STMTS is released and the node is returned; for a
   node recorded as failed its per-lane failure state is copied to
   MATCHES and NULL is returned.

   Otherwise a stub node is entered into BST_MAP (keyed by a copy of
   STMTS) and handed to vect_build_slp_tree_2 to fill in.  *LIMIT is the
   remaining discovery budget; it is decremented for each such attempt
   and when it is already zero discovery fails with MATCHES all false.
   On failure the stub stays in BST_MAP marked invalid with the contents
   of MATCHES recorded in its FAILED array, and NULL is returned.  On
   success the node is returned with an extra reference held on behalf
   of BST_MAP.  TREE_SIZE is passed through to vect_build_slp_tree_2.  */
1546 static slp_tree
1547 vect_build_slp_tree (vec_info *vinfo,
1548                      vec<stmt_vec_info> stmts, unsigned int group_size,
1549                      poly_uint64 *max_nunits,
1550                      bool *matches, unsigned *limit, unsigned *tree_size,
1551                      scalar_stmts_to_slp_tree_map_t *bst_map)
1552 {
       /* First see whether discovery already ran for this set of stmts.  */
1553   if (slp_tree *leader = bst_map->get (stmts))
1554     {
1555       if (dump_enabled_p ())
1556         dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1557                          !(*leader)->failed ? "" : "failed ",
1558                          (void *) *leader);
1559       if (!(*leader)->failed)
1560         {
1561           SLP_TREE_REF_COUNT (*leader)++;
1562           vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1563           stmts.release ();
1564           return *leader;
1565         }
           /* The earlier attempt failed; report the same mismatching lanes.
              STMTS is not released on this path.  */
1566       memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1567       return NULL;
1568     }
1569
1570   /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1571      so we can pick up backedge destinations during discovery.  */
1572   slp_tree res = new _slp_tree;
1573   SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1574   SLP_TREE_SCALAR_STMTS (res) = stmts;
1575   bst_map->put (stmts.copy (), res);
1576
       /* Out of discovery budget: record the stub as failed without
          attempting to build it.  */
1577   if (*limit == 0)
1578     {
1579       if (dump_enabled_p ())
1580         dump_printf_loc (MSG_NOTE, vect_location,
1581                          "SLP discovery limit exceeded\n");
1582       /* Mark the node invalid so we can detect those when still in use
1583          as backedge destinations.  */
1584       SLP_TREE_SCALAR_STMTS (res) = vNULL;
1585       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1586       res->failed = XNEWVEC (bool, group_size);
1587       memset (res->failed, 0, sizeof (bool) * group_size);
1588       memset (matches, 0, sizeof (bool) * group_size);
1589       return NULL;
1590     }
1591   --*limit;
1592
1593   if (dump_enabled_p ())
1594     dump_printf_loc (MSG_NOTE, vect_location,
1595                      "starting SLP discovery for node %p\n", (void *) res);
1596
       /* Discover with a fresh max_nunits so the value recorded on the
          node reflects only this subtree.  */
1597   poly_uint64 this_max_nunits = 1;
1598   slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1599                                         &this_max_nunits,
1600                                         matches, limit, tree_size, bst_map);
1601   if (!res_)
1602     {
1603       if (dump_enabled_p ())
1604         dump_printf_loc (MSG_NOTE, vect_location,
1605                          "SLP discovery for node %p failed\n", (void *) res);
1606       /* Mark the node invalid so we can detect those when still in use
1607          as backedge destinations.  */
1608       SLP_TREE_SCALAR_STMTS (res) = vNULL;
1609       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1610       res->failed = XNEWVEC (bool, group_size);
           /* With checking enabled verify that a failed discovery flagged
              at least one lane as not matching.  */
1611       if (flag_checking)
1612         {
1613           unsigned i;
1614           for (i = 0; i < group_size; ++i)
1615             if (!matches[i])
1616               break;
1617           gcc_assert (i < group_size);
1618         }
1619       memcpy (res->failed, matches, sizeof (bool) * group_size);
1620     }
1621   else
1622     {
1623       if (dump_enabled_p ())
1624         dump_printf_loc (MSG_NOTE, vect_location,
1625                          "SLP discovery for node %p succeeded\n",
1626                          (void *) res);
1627       gcc_assert (res_ == res);
1628       res->max_nunits = this_max_nunits;
1629       vect_update_max_nunits (max_nunits, this_max_nunits);
1630       /* Keep a reference for the bst_map use.  */
1631       SLP_TREE_REF_COUNT (res)++;
1632     }
1633   return res_;
1634 }
1635
1636 /* Helper for building an associated SLP node chain.
        Turn PERM into a VEC_PERM_EXPR node of vector type VECTYPE whose
        lane permutation is LPERM and whose two children are freshly
        created internal nodes.  Both new nodes have OP0 and OP1 as
        children and as many lanes as OP1; the first uses OPER1 as its
        representative stmt, the second OPER2.  */
1637
1638 static void
1639 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1640                                    slp_tree op0, slp_tree op1,
1641                                    stmt_vec_info oper1, stmt_vec_info oper2,
1642                                    vec<std::pair<unsigned, unsigned> > lperm)
1643 {
1644   unsigned group_size = SLP_TREE_LANES (op1);
1645
       /* First operation node.  OP0 and OP1 are pushed without bumping
          their reference counts.  */
1646   slp_tree child1 = new _slp_tree;
1647   SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1648   SLP_TREE_VECTYPE (child1) = vectype;
1649   SLP_TREE_LANES (child1) = group_size;
1650   SLP_TREE_CHILDREN (child1).create (2);
1651   SLP_TREE_CHILDREN (child1).quick_push (op0);
1652   SLP_TREE_CHILDREN (child1).quick_push (op1);
1653   SLP_TREE_REPRESENTATIVE (child1) = oper1;
1654
       /* Second operation node.  It shares OP0 and OP1 with the first one,
          so each operand gets an additional reference.  */
1655   slp_tree child2 = new _slp_tree;
1656   SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1657   SLP_TREE_VECTYPE (child2) = vectype;
1658   SLP_TREE_LANES (child2) = group_size;
1659   SLP_TREE_CHILDREN (child2).create (2);
1660   SLP_TREE_CHILDREN (child2).quick_push (op0);
1661   SLP_TREE_REF_COUNT (op0)++;
1662   SLP_TREE_CHILDREN (child2).quick_push (op1);
1663   SLP_TREE_REF_COUNT (op1)++;
1664   SLP_TREE_REPRESENTATIVE (child2) = oper2;
1665
1666   SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1667   SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1668   SLP_TREE_VECTYPE (perm) = vectype;
1669   SLP_TREE_LANES (perm) = group_size;
1670   /* ???  We should set this NULL but that's not expected.  */
1671   SLP_TREE_REPRESENTATIVE (perm) = oper1;
1672   SLP_TREE_LANE_PERMUTATION (perm) = lperm;
       /* NOTE(review): no space is reserved for PERM's children here;
          presumably the caller created PERM with room for two children
          -- confirm against the callers.  */
1673   SLP_TREE_CHILDREN (perm).quick_push (child1);
1674   SLP_TREE_CHILDREN (perm).quick_push (child2);
1675 }
1676
1677 /* Recursively build an SLP tree starting from NODE.
1678    Fail (and return NULL) if def-stmts are not isomorphic, require
1679    data permutation or are of unsupported types of operation.
1680    In that case MATCHES indicates the lanes for which a mismatch
1681    was found.  Otherwise return NODE, filled in for the scalar
1682    stmts STMTS.  */
1683
1684 static slp_tree
1685 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1686                        vec<stmt_vec_info> stmts, unsigned int group_size,
1687                        poly_uint64 *max_nunits,
1688                        bool *matches, unsigned *limit, unsigned *tree_size,
1689                        scalar_stmts_to_slp_tree_map_t *bst_map)
1690 {
1691   unsigned nops, i, this_tree_size = 0;
1692   poly_uint64 this_max_nunits = *max_nunits;
1693
1694   matches[0] = false;
1695
1696   stmt_vec_info stmt_info = stmts[0];
1697   if (!is_a<gcall *> (stmt_info->stmt)
1698       && !is_a<gassign *> (stmt_info->stmt)
1699       && !is_a<gphi *> (stmt_info->stmt))
1700     return NULL;
1701
1702   nops = gimple_num_args (stmt_info->stmt);
1703   if (const int *map = vect_get_operand_map (stmt_info->stmt))
1704     nops = map[0];
1705
1706   /* If the SLP node is a PHI (induction or reduction), terminate
1707      the recursion.  */
1708   bool *skip_args = XALLOCAVEC (bool, nops);
1709   memset (skip_args, 0, sizeof (bool) * nops);
1710   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1711     if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1712       {
1713         tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1714         tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1715                                                     group_size);
1716         if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1717                                      max_nunits))
1718           return NULL;
1719
1720         vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1721         if (def_type == vect_induction_def)
1722           {
1723             /* Induction PHIs are not cycles but walk the initial
1724                value.  Only for inner loops though, for outer loops
1725                we need to pick up the value from the actual PHIs
1726                to more easily support peeling and epilogue vectorization.  */
1727             class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1728             if (!nested_in_vect_loop_p (loop, stmt_info))
1729               skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1730             else
1731               loop = loop->inner;
1732             skip_args[loop_latch_edge (loop)->dest_idx] = true;
1733           }
1734         else if (def_type == vect_reduction_def
1735                  || def_type == vect_double_reduction_def
1736                  || def_type == vect_nested_cycle
1737                  || def_type == vect_first_order_recurrence)
1738           {
1739             /* Else def types have to match.  */
1740             stmt_vec_info other_info;
1741             bool all_same = true;
1742             FOR_EACH_VEC_ELT (stmts, i, other_info)
1743               {
1744                 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1745                   return NULL;
1746                 if (other_info != stmt_info)
1747                   all_same = false;
1748               }
1749             class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1750             /* Reduction initial values are not explicitly represented.  */
1751             if (def_type != vect_first_order_recurrence
1752                 && !nested_in_vect_loop_p (loop, stmt_info))
1753               skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1754             /* Reduction chain backedge defs are filled manually.
1755                ???  Need a better way to identify a SLP reduction chain PHI.
1756                Or a better overall way to SLP match those.  */
1757             if (all_same && def_type == vect_reduction_def)
1758               skip_args[loop_latch_edge (loop)->dest_idx] = true;
1759           }
1760         else if (def_type != vect_internal_def)
1761           return NULL;
1762       }
1763
1764
1765   bool two_operators = false;
1766   unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1767   tree vectype = NULL_TREE;
1768   if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1769                               &this_max_nunits, matches, &two_operators,
1770                               &vectype))
1771     return NULL;
1772
1773   /* If the SLP node is a load, terminate the recursion unless masked.  */
1774   if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1775       && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1776     {
1777       if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1778         gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1779                     || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1780                     || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1781       else
1782         {
1783           *max_nunits = this_max_nunits;
1784           (*tree_size)++;
1785           node = vect_create_new_slp_node (node, stmts, 0);
1786           SLP_TREE_VECTYPE (node) = vectype;
1787           /* And compute the load permutation.  Whether it is actually
1788              a permutation depends on the unrolling factor which is
1789              decided later.  */
1790           vec<unsigned> load_permutation;
1791           int j;
1792           stmt_vec_info load_info;
1793           load_permutation.create (group_size);
1794           stmt_vec_info first_stmt_info
1795             = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1796           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1797             {
1798               int load_place = vect_get_place_in_interleaving_chain
1799                   (load_info, first_stmt_info);
1800               gcc_assert (load_place != -1);
1801               load_permutation.safe_push (load_place);
1802             }
1803           SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1804           return node;
1805         }
1806     }
1807   else if (gimple_assign_single_p (stmt_info->stmt)
1808            && !gimple_vuse (stmt_info->stmt)
1809            && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1810     {
1811       /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1812          the same SSA name vector of a compatible type to vectype.  */
1813       vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1814       tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1815       stmt_vec_info estmt_info;
1816       FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1817         {
1818           gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1819           tree bfref = gimple_assign_rhs1 (estmt);
1820           HOST_WIDE_INT lane;
1821           if (!known_eq (bit_field_size (bfref),
1822                          tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1823               || !constant_multiple_p (bit_field_offset (bfref),
1824                                        bit_field_size (bfref), &lane))
1825             {
1826               lperm.release ();
1827               matches[0] = false;
1828               return NULL;
1829             }
1830           lperm.safe_push (std::make_pair (0, (unsigned)lane));
1831         }
1832       slp_tree vnode = vect_create_new_slp_node (vNULL);
1833       if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1834         /* ???  We record vectype here but we hide eventually necessary
1835            punning and instead rely on code generation to materialize
1836            VIEW_CONVERT_EXPRs as necessary.  We instead should make
1837            this explicit somehow.  */
1838         SLP_TREE_VECTYPE (vnode) = vectype;
1839       else
1840         {
1841           /* For different size but compatible elements we can still
1842              use VEC_PERM_EXPR without punning.  */
1843           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
1844                       && types_compatible_p (TREE_TYPE (vectype),
1845                                              TREE_TYPE (TREE_TYPE (vec))));
1846           SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1847         }
1848       auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
1849       unsigned HOST_WIDE_INT const_nunits;
1850       if (nunits.is_constant (&const_nunits))
1851         SLP_TREE_LANES (vnode) = const_nunits;
1852       SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1853       /* We are always building a permutation node even if it is an identity
1854          permute to shield the rest of the vectorizer from the odd node
1855          representing an actual vector without any scalar ops.
1856          ???  We could hide it completely with making the permute node
1857          external?  */
1858       node = vect_create_new_slp_node (node, stmts, 1);
1859       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1860       SLP_TREE_LANE_PERMUTATION (node) = lperm;
1861       SLP_TREE_VECTYPE (node) = vectype;
1862       SLP_TREE_CHILDREN (node).quick_push (vnode);
1863       return node;
1864     }
1865   /* When discovery reaches an associatable operation see whether we can
1866      improve that to match up lanes in a way superior to the operand
1867      swapping code which at most looks at two defs.
1868      ???  For BB vectorization we cannot do the brute-force search
1869      for matching as we can succeed by means of builds from scalars
1870      and have no good way to "cost" one build against another.  */
1871   else if (is_a <loop_vec_info> (vinfo)
1872            /* ???  We don't handle !vect_internal_def defs below.  */
1873            && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1874            && is_gimple_assign (stmt_info->stmt)
1875            && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1876                || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1877            && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1878                || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1879                    && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1880     {
1881       /* See if we have a chain of (mixed) adds or subtracts or other
1882          associatable ops.  */
1883       enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1884       if (code == MINUS_EXPR)
1885         code = PLUS_EXPR;
1886       stmt_vec_info other_op_stmt_info = NULL;
1887       stmt_vec_info op_stmt_info = NULL;
1888       unsigned chain_len = 0;
1889       auto_vec<chain_op_t> chain;
1890       auto_vec<std::pair<tree_code, gimple *> > worklist;
1891       auto_vec<vec<chain_op_t> > chains (group_size);
1892       auto_vec<slp_tree, 4> children;
1893       bool hard_fail = true;
1894       for (unsigned lane = 0; lane < group_size; ++lane)
1895         {
1896           /* For each lane linearize the addition/subtraction (or other
1897              uniform associatable operation) expression tree.  */
1898           gimple *op_stmt = NULL, *other_op_stmt = NULL;
1899           vect_slp_linearize_chain (vinfo, worklist, chain, code,
1900                                     stmts[lane]->stmt, op_stmt, other_op_stmt,
1901                                     NULL);
1902           if (!op_stmt_info && op_stmt)
1903             op_stmt_info = vinfo->lookup_stmt (op_stmt);
1904           if (!other_op_stmt_info && other_op_stmt)
1905             other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1906           if (chain.length () == 2)
1907             {
1908               /* In a chain of just two elements resort to the regular
1909                  operand swapping scheme.  If we run into a length
1910                  mismatch still hard-FAIL.  */
1911               if (chain_len == 0)
1912                 hard_fail = false;
1913               else
1914                 {
1915                   matches[lane] = false;
1916                   /* ???  We might want to process the other lanes, but
1917                      make sure to not give false matching hints to the
1918                      caller for lanes we did not process.  */
1919                   if (lane != group_size - 1)
1920                     matches[0] = false;
1921                 }
1922               break;
1923             }
1924           else if (chain_len == 0)
1925             chain_len = chain.length ();
1926           else if (chain.length () != chain_len)
1927             {
1928               /* ???  Here we could slip in magic to compensate with
1929                  neutral operands.  */
1930               matches[lane] = false;
1931               if (lane != group_size - 1)
1932                 matches[0] = false;
1933               break;
1934             }
1935           chains.quick_push (chain.copy ());
1936           chain.truncate (0);
1937         }
1938       if (chains.length () == group_size)
1939         {
1940           /* We cannot yet use SLP_TREE_CODE to communicate the operation.  */
1941           if (!op_stmt_info)
1942             {
1943               hard_fail = false;
1944               goto out;
1945             }
1946           /* Now we have a set of chains with the same length.  */
1947           /* 1. pre-sort according to def_type and operation.  */
1948           for (unsigned lane = 0; lane < group_size; ++lane)
1949             chains[lane].stablesort (dt_sort_cmp, vinfo);
1950           if (dump_enabled_p ())
1951             {
1952               dump_printf_loc (MSG_NOTE, vect_location,
1953                                "pre-sorted chains of %s\n",
1954                                get_tree_code_name (code));
1955               for (unsigned lane = 0; lane < group_size; ++lane)
1956                 {
1957                   for (unsigned opnum = 0; opnum < chain_len; ++opnum)
1958                     dump_printf (MSG_NOTE, "%s %T ",
1959                                  get_tree_code_name (chains[lane][opnum].code),
1960                                  chains[lane][opnum].op);
1961                   dump_printf (MSG_NOTE, "\n");
1962                 }
1963             }
1964           /* 2. try to build children nodes, associating as necessary.  */
1965           for (unsigned n = 0; n < chain_len; ++n)
1966             {
1967               vect_def_type dt = chains[0][n].dt;
1968               unsigned lane;
1969               for (lane = 0; lane < group_size; ++lane)
1970                 if (chains[lane][n].dt != dt)
1971                   {
1972                     if (dt == vect_constant_def
1973                         && chains[lane][n].dt == vect_external_def)
1974                       dt = vect_external_def;
1975                     else if (dt == vect_external_def
1976                              && chains[lane][n].dt == vect_constant_def)
1977                       ;
1978                     else
1979                       break;
1980                   }
1981               if (lane != group_size)
1982                 {
1983                   if (dump_enabled_p ())
1984                     dump_printf_loc (MSG_NOTE, vect_location,
1985                                      "giving up on chain due to mismatched "
1986                                      "def types\n");
1987                   matches[lane] = false;
1988                   if (lane != group_size - 1)
1989                     matches[0] = false;
1990                   goto out;
1991                 }
1992               if (dt == vect_constant_def
1993                   || dt == vect_external_def)
1994                 {
1995                   /* Check whether we can build the invariant.  If we can't
1996                      we never will be able to.  */
1997                   tree type = TREE_TYPE (chains[0][n].op);
1998                   if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
1999                       && (TREE_CODE (type) == BOOLEAN_TYPE
2000                           || !can_duplicate_and_interleave_p (vinfo, group_size,
2001                                                               type)))
2002                     {
2003                       matches[0] = false;
2004                       goto out;
2005                     }
2006                   vec<tree> ops;
2007                   ops.create (group_size);
2008                   for (lane = 0; lane < group_size; ++lane)
2009                     ops.quick_push (chains[lane][n].op);
2010                   slp_tree child = vect_create_new_slp_node (ops);
2011                   SLP_TREE_DEF_TYPE (child) = dt;
2012                   children.safe_push (child);
2013                 }
2014               else if (dt != vect_internal_def)
2015                 {
2016                   /* Not sure, we might need sth special.
2017                      gcc.dg/vect/pr96854.c,
2018                      gfortran.dg/vect/fast-math-pr37021.f90
2019                      and gfortran.dg/vect/pr61171.f trigger.  */
2020                   /* Soft-fail for now.  */
2021                   hard_fail = false;
2022                   goto out;
2023                 }
2024               else
2025                 {
2026                   vec<stmt_vec_info> op_stmts;
2027                   op_stmts.create (group_size);
2028                   slp_tree child = NULL;
2029                   /* Brute-force our way.  We have to consider a lane
2030                      failing after fixing an earlier fail up in the
2031                      SLP discovery recursion.  So track the current
2032                      permute per lane.  */
2033                   unsigned *perms = XALLOCAVEC (unsigned, group_size);
2034                   memset (perms, 0, sizeof (unsigned) * group_size);
2035                   do
2036                     {
2037                       op_stmts.truncate (0);
2038                       for (lane = 0; lane < group_size; ++lane)
2039                         op_stmts.quick_push
2040                           (vinfo->lookup_def (chains[lane][n].op));
2041                       child = vect_build_slp_tree (vinfo, op_stmts,
2042                                                    group_size, &this_max_nunits,
2043                                                    matches, limit,
2044                                                    &this_tree_size, bst_map);
2045                       /* ???  We're likely getting too many fatal mismatches
2046                          here so maybe we want to ignore them (but then we
2047                          have no idea which lanes fatally mismatched).  */
2048                       if (child || !matches[0])
2049                         break;
2050                       /* Swap another lane we have not yet matched up into
2051                          lanes that did not match.  If we run out of
2052                          permute possibilities for a lane terminate the
2053                          search.  */
2054                       bool term = false;
2055                       for (lane = 1; lane < group_size; ++lane)
2056                         if (!matches[lane])
2057                           {
2058                             if (n + perms[lane] + 1 == chain_len)
2059                               {
2060                                 term = true;
2061                                 break;
2062                               }
2063                             std::swap (chains[lane][n],
2064                                        chains[lane][n + perms[lane] + 1]);
2065                             perms[lane]++;
2066                           }
2067                       if (term)
2068                         break;
2069                     }
2070                   while (1);
2071                   if (!child)
2072                     {
2073                       if (dump_enabled_p ())
2074                         dump_printf_loc (MSG_NOTE, vect_location,
2075                                          "failed to match up op %d\n", n);
2076                       op_stmts.release ();
2077                       if (lane != group_size - 1)
2078                         matches[0] = false;
2079                       else
2080                         matches[lane] = false;
2081                       goto out;
2082                     }
2083                   if (dump_enabled_p ())
2084                     {
2085                       dump_printf_loc (MSG_NOTE, vect_location,
2086                                        "matched up op %d to\n", n);
2087                       vect_print_slp_tree (MSG_NOTE, vect_location, child);
2088                     }
2089                   children.safe_push (child);
2090                 }
2091             }
2092           /* 3. build SLP nodes to combine the chain.  */
2093           for (unsigned lane = 0; lane < group_size; ++lane)
2094             if (chains[lane][0].code != code)
2095               {
2096                 /* See if there's any alternate all-PLUS entry.  */
2097                 unsigned n;
2098                 for (n = 1; n < chain_len; ++n)
2099                   {
2100                     for (lane = 0; lane < group_size; ++lane)
2101                       if (chains[lane][n].code != code)
2102                         break;
2103                     if (lane == group_size)
2104                       break;
2105                   }
2106                 if (n != chain_len)
2107                   {
2108                     /* Swap that in at first position.  */
2109                     std::swap (children[0], children[n]);
2110                     for (lane = 0; lane < group_size; ++lane)
2111                       std::swap (chains[lane][0], chains[lane][n]);
2112                   }
2113                 else
2114                   {
2115                     /* ???  When this triggers and we end up with two
2116                        vect_constant/external_def up-front things break (ICE)
2117                        spectacularly finding an insertion place for the
2118                        all-constant op.  We should have a fully
2119                        vect_internal_def operand though(?) so we can swap
2120                        that into first place and then prepend the all-zero
2121                        constant.  */
2122                     if (dump_enabled_p ())
2123                       dump_printf_loc (MSG_NOTE, vect_location,
2124                                        "inserting constant zero to compensate "
2125                                        "for (partially) negated first "
2126                                        "operand\n");
2127                     chain_len++;
2128                     for (lane = 0; lane < group_size; ++lane)
2129                       chains[lane].safe_insert
2130                         (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2131                     vec<tree> zero_ops;
2132                     zero_ops.create (group_size);
2133                     zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2134                     for (lane = 1; lane < group_size; ++lane)
2135                       zero_ops.quick_push (zero_ops[0]);
2136                     slp_tree zero = vect_create_new_slp_node (zero_ops);
2137                     SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2138                     children.safe_insert (0, zero);
2139                   }
2140                 break;
2141               }
2142           for (unsigned i = 1; i < children.length (); ++i)
2143             {
2144               slp_tree op0 = children[i - 1];
2145               slp_tree op1 = children[i];
2146               bool this_two_op = false;
2147               for (unsigned lane = 0; lane < group_size; ++lane)
2148                 if (chains[lane][i].code != chains[0][i].code)
2149                   {
2150                     this_two_op = true;
2151                     break;
2152                   }
2153               slp_tree child;
2154               if (i == children.length () - 1)
2155                 child = vect_create_new_slp_node (node, stmts, 2);
2156               else
2157                 child = vect_create_new_slp_node (2, ERROR_MARK);
2158               if (this_two_op)
2159                 {
2160                   vec<std::pair<unsigned, unsigned> > lperm;
2161                   lperm.create (group_size);
2162                   for (unsigned lane = 0; lane < group_size; ++lane)
2163                     lperm.quick_push (std::make_pair
2164                       (chains[lane][i].code != chains[0][i].code, lane));
2165                   vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2166                                                      (chains[0][i].code == code
2167                                                       ? op_stmt_info
2168                                                       : other_op_stmt_info),
2169                                                      (chains[0][i].code == code
2170                                                       ? other_op_stmt_info
2171                                                       : op_stmt_info),
2172                                                      lperm);
2173                 }
2174               else
2175                 {
2176                   SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2177                   SLP_TREE_VECTYPE (child) = vectype;
2178                   SLP_TREE_LANES (child) = group_size;
2179                   SLP_TREE_CHILDREN (child).quick_push (op0);
2180                   SLP_TREE_CHILDREN (child).quick_push (op1);
2181                   SLP_TREE_REPRESENTATIVE (child)
2182                     = (chains[0][i].code == code
2183                        ? op_stmt_info : other_op_stmt_info);
2184                 }
2185               children[i] = child;
2186             }
2187           *tree_size += this_tree_size + 1;
2188           *max_nunits = this_max_nunits;
2189           while (!chains.is_empty ())
2190             chains.pop ().release ();
2191           return node;
2192         }
2193 out:
2194       while (!children.is_empty ())
2195         vect_free_slp_tree (children.pop ());
2196       while (!chains.is_empty ())
2197         chains.pop ().release ();
2198       /* Hard-fail, otherwise we might run into quadratic processing of the
2199          chains starting one stmt into the chain again.  */
2200       if (hard_fail)
2201         return NULL;
2202       /* Fall thru to normal processing.  */
2203     }
2204
2205   /* Get at the operands, verifying they are compatible.  */
2206   vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2207   slp_oprnd_info oprnd_info;
2208   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2209     {
2210       int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2211                                              stmts, i, &oprnds_info);
2212       if (res != 0)
2213         matches[(res == -1) ? 0 : i] = false;
2214       if (!matches[0])
2215         break;
2216     }
2217   for (i = 0; i < group_size; ++i)
2218     if (!matches[i])
2219       {
2220         vect_free_oprnd_info (oprnds_info);
2221         return NULL;
2222       }
2223   swap = NULL;
2224
2225   auto_vec<slp_tree, 4> children;
2226
2227   stmt_info = stmts[0];
2228
2229   /* Create SLP_TREE nodes for the definition node/s.  */
2230   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2231     {
2232       slp_tree child;
2233       unsigned int j;
2234
2235       /* We're skipping certain operands from processing, for example
2236          outer loop reduction initial defs.  */
2237       if (skip_args[i])
2238         {
2239           children.safe_push (NULL);
2240           continue;
2241         }
2242
2243       if (oprnd_info->first_dt == vect_uninitialized_def)
2244         {
2245           /* COND_EXPR have one too many eventually if the condition
2246              is a SSA name.  */
2247           gcc_assert (i == 3 && nops == 4);
2248           continue;
2249         }
2250
2251       if (is_a <bb_vec_info> (vinfo)
2252           && oprnd_info->first_dt == vect_internal_def
2253           && !oprnd_info->any_pattern)
2254         {
2255           /* For BB vectorization, if all defs are the same do not
2256              bother to continue the build along the single-lane
2257              graph but use a splat of the scalar value.  */
2258           stmt_vec_info first_def = oprnd_info->def_stmts[0];
2259           for (j = 1; j < group_size; ++j)
2260             if (oprnd_info->def_stmts[j] != first_def)
2261               break;
2262           if (j == group_size
2263               /* But avoid doing this for loads where we may be
2264                  able to CSE things, unless the stmt is not
2265                  vectorizable.  */
2266               && (!STMT_VINFO_VECTORIZABLE (first_def)
2267                   || !gimple_vuse (first_def->stmt)))
2268             {
2269               if (dump_enabled_p ())
2270                 dump_printf_loc (MSG_NOTE, vect_location,
2271                                  "Using a splat of the uniform operand %G",
2272                                  first_def->stmt);
2273               oprnd_info->first_dt = vect_external_def;
2274             }
2275         }
2276
2277       if (oprnd_info->first_dt == vect_external_def
2278           || oprnd_info->first_dt == vect_constant_def)
2279         {
2280           slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2281           SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2282           oprnd_info->ops = vNULL;
2283           children.safe_push (invnode);
2284           continue;
2285         }
2286
2287       if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2288                                         group_size, &this_max_nunits,
2289                                         matches, limit,
2290                                         &this_tree_size, bst_map)) != NULL)
2291         {
2292           oprnd_info->def_stmts = vNULL;
2293           children.safe_push (child);
2294           continue;
2295         }
2296
2297       /* If the SLP build for operand zero failed and operand zero
2298          and one can be commutated try that for the scalar stmts
2299          that failed the match.  */
2300       if (i == 0
2301           /* A first scalar stmt mismatch signals a fatal mismatch.  */
2302           && matches[0]
2303           /* ???  For COND_EXPRs we can swap the comparison operands
2304              as well as the arms under some constraints.  */
2305           && nops == 2
2306           && oprnds_info[1]->first_dt == vect_internal_def
2307           && is_gimple_assign (stmt_info->stmt)
2308           /* Swapping operands for reductions breaks assumptions later on.  */
2309           && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2310           && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2311         {
2312           /* See whether we can swap the matching or the non-matching
2313              stmt operands.  */
2314           bool swap_not_matching = true;
2315           do
2316             {
2317               for (j = 0; j < group_size; ++j)
2318                 {
2319                   if (matches[j] != !swap_not_matching)
2320                     continue;
2321                   stmt_vec_info stmt_info = stmts[j];
2322                   /* Verify if we can swap operands of this stmt.  */
2323                   gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2324                   if (!stmt
2325                       || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2326                     {
2327                       if (!swap_not_matching)
2328                         goto fail;
2329                       swap_not_matching = false;
2330                       break;
2331                     }
2332                 }
2333             }
2334           while (j != group_size);
2335
2336           /* Swap mismatched definition stmts.  */
2337           if (dump_enabled_p ())
2338             dump_printf_loc (MSG_NOTE, vect_location,
2339                              "Re-trying with swapped operands of stmts ");
2340           for (j = 0; j < group_size; ++j)
2341             if (matches[j] == !swap_not_matching)
2342               {
2343                 std::swap (oprnds_info[0]->def_stmts[j],
2344                            oprnds_info[1]->def_stmts[j]);
2345                 std::swap (oprnds_info[0]->ops[j],
2346                            oprnds_info[1]->ops[j]);
2347                 if (dump_enabled_p ())
2348                   dump_printf (MSG_NOTE, "%d ", j);
2349               }
2350           if (dump_enabled_p ())
2351             dump_printf (MSG_NOTE, "\n");
2352           /* After swapping some operands we lost track whether an
2353              operand has any pattern defs so be conservative here.  */
2354           if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2355             oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2356           /* And try again with scratch 'matches' ... */
2357           bool *tem = XALLOCAVEC (bool, group_size);
2358           if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2359                                             group_size, &this_max_nunits,
2360                                             tem, limit,
2361                                             &this_tree_size, bst_map)) != NULL)
2362             {
2363               oprnd_info->def_stmts = vNULL;
2364               children.safe_push (child);
2365               continue;
2366             }
2367         }
2368 fail:
2369
2370       /* If the SLP build failed and we analyze a basic-block
2371          simply treat nodes we fail to build as externally defined
2372          (and thus build vectors from the scalar defs).
2373          The cost model will reject outright expensive cases.
2374          ???  This doesn't treat cases where permutation ultimately
2375          fails (or we don't try permutation below).  Ideally we'd
2376          even compute a permutation that will end up with the maximum
2377          SLP tree size...  */
2378       if (is_a <bb_vec_info> (vinfo)
2379           /* ???  Rejecting patterns this way doesn't work.  We'd have to
2380              do extra work to cancel the pattern so the uses see the
2381              scalar version.  */
2382           && !is_pattern_stmt_p (stmt_info)
2383           && !oprnd_info->any_pattern)
2384         {
2385           /* But if there's a leading vector sized set of matching stmts
2386              fail here so we can split the group.  This matches the condition
2387              vect_analyze_slp_instance uses.  */
2388           /* ???  We might want to split here and combine the results to support
2389              multiple vector sizes better.  */
2390           for (j = 0; j < group_size; ++j)
2391             if (!matches[j])
2392               break;
2393           if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2394             {
2395               if (dump_enabled_p ())
2396                 dump_printf_loc (MSG_NOTE, vect_location,
2397                                  "Building vector operands from scalars\n");
2398               this_tree_size++;
2399               child = vect_create_new_slp_node (oprnd_info->ops);
2400               children.safe_push (child);
2401               oprnd_info->ops = vNULL;
2402               continue;
2403             }
2404         }
2405
2406       gcc_assert (child == NULL);
2407       FOR_EACH_VEC_ELT (children, j, child)
2408         if (child)
2409           vect_free_slp_tree (child);
2410       vect_free_oprnd_info (oprnds_info);
2411       return NULL;
2412     }
2413
2414   vect_free_oprnd_info (oprnds_info);
2415
2416   /* If all children of this node are built up from uniform scalars
2417      or if more than one of them needs a possibly expensive vector
2418      construction then just throw the node away, causing it to be built
2419      up from scalars.  The exception is the SLP node for the vector store.  */
2420   if (is_a <bb_vec_info> (vinfo)
2421       && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2422       /* ???  Rejecting patterns this way doesn't work.  We'd have to
2423          do extra work to cancel the pattern so the uses see the
2424          scalar version.  */
2425       && !is_pattern_stmt_p (stmt_info))
2426     {
2427       slp_tree child;
2428       unsigned j;
2429       bool all_uniform_p = true;
2430       unsigned n_vector_builds = 0;
2431       FOR_EACH_VEC_ELT (children, j, child)
2432         {
2433           if (!child)
2434             ;
2435           else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2436             all_uniform_p = false;
2437           else if (!vect_slp_tree_uniform_p (child))
2438             {
2439               all_uniform_p = false;
2440               if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2441                 n_vector_builds++;
2442             }
2443         }
2444       if (all_uniform_p
2445           || n_vector_builds > 1
2446           || (n_vector_builds == children.length ()
2447               && is_a <gphi *> (stmt_info->stmt)))
2448         {
2449           /* Roll back.  */
2450           matches[0] = false;
2451           FOR_EACH_VEC_ELT (children, j, child)
2452             if (child)
2453               vect_free_slp_tree (child);
2454
2455           if (dump_enabled_p ())
2456             dump_printf_loc (MSG_NOTE, vect_location,
2457                              "Building parent vector operands from "
2458                              "scalars instead\n");
2459           return NULL;
2460         }
2461     }
2462
2463   *tree_size += this_tree_size + 1;
2464   *max_nunits = this_max_nunits;
2465
2466   if (two_operators)
2467     {
2468       /* ???  We'd likely want to either cache in bst_map sth like
2469          { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2470          the true { a+b, a+b, a+b, a+b } ... but there we don't have
2471          explicit stmts to put in so the keying on 'stmts' doesn't
2472          work (but we have the same issue with nodes that use 'ops').  */
2473       slp_tree one = new _slp_tree;
2474       slp_tree two = new _slp_tree;
2475       SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2476       SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2477       SLP_TREE_VECTYPE (one) = vectype;
2478       SLP_TREE_VECTYPE (two) = vectype;
2479       SLP_TREE_CHILDREN (one).safe_splice (children);
2480       SLP_TREE_CHILDREN (two).safe_splice (children);
2481       slp_tree child;
2482       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2483         SLP_TREE_REF_COUNT (child)++;
2484
2485       /* Here we record the original defs since this
2486          node represents the final lane configuration.  */
2487       node = vect_create_new_slp_node (node, stmts, 2);
2488       SLP_TREE_VECTYPE (node) = vectype;
2489       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2490       SLP_TREE_CHILDREN (node).quick_push (one);
2491       SLP_TREE_CHILDREN (node).quick_push (two);
2492       gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2493       enum tree_code code0 = gimple_assign_rhs_code (stmt);
2494       enum tree_code ocode = ERROR_MARK;
2495       stmt_vec_info ostmt_info;
2496       unsigned j = 0;
2497       FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2498         {
2499           gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2500           if (gimple_assign_rhs_code (ostmt) != code0)
2501             {
2502               SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2503               ocode = gimple_assign_rhs_code (ostmt);
2504               j = i;
2505             }
2506           else
2507             SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2508         }
2509       SLP_TREE_CODE (one) = code0;
2510       SLP_TREE_CODE (two) = ocode;
2511       SLP_TREE_LANES (one) = stmts.length ();
2512       SLP_TREE_LANES (two) = stmts.length ();
2513       SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2514       SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2515       return node;
2516     }
2517
2518   node = vect_create_new_slp_node (node, stmts, nops);
2519   SLP_TREE_VECTYPE (node) = vectype;
2520   SLP_TREE_CHILDREN (node).splice (children);
2521   return node;
2522 }
2523
2524 /* Dump a single SLP tree NODE.  The dump consists of a header line with
        the address of NODE, its max_nunits, its reference count and its
        vector type (if set), the operation of an internal def, the scalar
        stmts or else the scalar operands, the load and lane permutations
        (if present) and the addresses of the children.  The children
        themselves are not dumped.  DUMP_KIND and LOC provide the dump
        flags and the location used for the dump calls.  */
2525
2526 static void
2527 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2528                      slp_tree node)
2529 {
2530   unsigned i, j;
2531   slp_tree child;
2532   stmt_vec_info stmt_info;
2533   tree op;
2534
2535   dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2536   dump_user_location_t user_loc = loc.get_user_location ();
2537   dump_printf_loc (metadata, user_loc,
2538                    "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2539                    ", refcnt=%u)",
2540                    SLP_TREE_DEF_TYPE (node) == vect_external_def
2541                    ? " (external)"
2542                    : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2543                       ? " (constant)"
2544                       : ""), (void *) node,
2545                    estimated_poly_value (node->max_nunits),
2546                                          SLP_TREE_REF_COUNT (node));
2547   if (SLP_TREE_VECTYPE (node))
2548     dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2549   dump_printf (metadata, "\n");
       /* For internal defs dump the operation: either a permute or the
          representative stmt.  */
2550   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2551     {
2552       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2553         dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2554       else
2555         dump_printf_loc (metadata, user_loc, "op template: %G",
2556                          SLP_TREE_REPRESENTATIVE (node)->stmt);
2557     }
       /* Dump the scalar stmts if the node has them, otherwise the scalar
          operands as a comma separated list in braces.  */
2558   if (SLP_TREE_SCALAR_STMTS (node).exists ())
2559     FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2560       dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2561   else
2562     {
2563       dump_printf_loc (metadata, user_loc, "\t{ ");
2564       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2565         dump_printf (metadata, "%T%s ", op,
2566                      i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2567       dump_printf (metadata, "}\n");
2568     }
       /* NOTE(review): the permutation and children elements below are
          printed with DUMP_KIND rather than METADATA -- confirm that this
          difference is intended.  */
2569   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2570     {
2571       dump_printf_loc (metadata, user_loc, "\tload permutation {");
2572       FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2573         dump_printf (dump_kind, " %u", j);
2574       dump_printf (dump_kind, " }\n");
2575     }
2576   if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2577     {
2578       dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2579       for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2580         dump_printf (dump_kind, " %u[%u]",
2581                      SLP_TREE_LANE_PERMUTATION (node)[i].first,
2582                      SLP_TREE_LANE_PERMUTATION (node)[i].second);
2583       dump_printf (dump_kind, " }\n");
2584     }
       /* Nodes without children are done, otherwise list the children by
          address.  */
2585   if (SLP_TREE_CHILDREN (node).is_empty ())
2586     return;
2587   dump_printf_loc (metadata, user_loc, "\tchildren");
2588   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2589     dump_printf (dump_kind, " %p", (void *)child);
2590   dump_printf (dump_kind, "\n");
2591 }
2592
2593 DEBUG_FUNCTION void
2594 debug (slp_tree node)
2595 {
2596   /* Print NODE via vect_print_slp_tree under a local debug_dump_context.  */
2597   debug_dump_context ctx;
2598   dump_location_t loc = dump_location_t::from_location_t (UNKNOWN_LOCATION);
2599   vect_print_slp_tree (MSG_NOTE, loc, node);
2600 }
2601
2602 /* Recursive helper for the dot producer below.  Emits to F a dot node
        for NODE, keyed by its address, then one edge for every child slot of
        NODE and finally recurses into the non-NULL children.  The label
        text is produced by vect_print_slp_tree; the caller below installs a
        debug_dump_context on F, which presumably routes that dump into F.
        VISITED holds the nodes emitted so far so that each node is emitted
        only once.  */
2603
2604 static void
2605 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2606 {
2607   if (visited.add (node))
2608     return;
2609
2610   fprintf (f, "\"%p\" [label=\"", (void *)node);
2611   vect_print_slp_tree (MSG_NOTE,
2612                        dump_location_t::from_location_t (UNKNOWN_LOCATION),
2613                        node);
2614   fprintf (f, "\"];\n");
2615
2616
       /* NOTE(review): an edge is written for NULL children as well while
          the recursion below skips them -- confirm that this is intended.  */
2617   for (slp_tree child : SLP_TREE_CHILDREN (node))
2618     fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2619
2620   for (slp_tree child : SLP_TREE_CHILDREN (node))
2621     if (child)
2622       dot_slp_tree (f, child, visited);
2623 }
2624
     /* Write the SLP graph reachable from NODE to the file FNAME as a dot
        digraph.  If FNAME cannot be opened for writing, report that on
        stderr and return without dumping anything.  */

2625 DEBUG_FUNCTION void
2626 dot_slp_tree (const char *fname, slp_tree node)
2627 {
2628   FILE *f = fopen (fname, "w");
       /* fopen returns NULL on failure; do not write through it.  */
       if (!f)
         {
           fprintf (stderr, "dot_slp_tree: cannot open %s for writing\n",
                    fname);
           return;
         }
2629   fprintf (f, "digraph {\n");
2630   fflush (f);
       /* The inner scope ends the lifetime of CTX and VISITED before the
          closing brace of the digraph is written.  */
2631     {
2632       debug_dump_context ctx (f);
2633       hash_set<slp_tree> visited;
2634       dot_slp_tree (f, node, visited);
2635     }
2636   fflush (f);
2637   fprintf (f, "}\n");
2638   fclose (f);
2639 }
2640
2641 /* Dump NODE and every node reachable through its children using DUMP_KIND
2642    and LOC.  VISITED holds the nodes that were already dumped.  */
2643
2644 static void
2645 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2646                       slp_tree node, hash_set<slp_tree> &visited)
2647 {
2648   if (visited.add (node))
2649     return;
2650
2651   vect_print_slp_tree (dump_kind, loc, node);
2652
2653   for (slp_tree kid : SLP_TREE_CHILDREN (node))
2654     if (kid)
2655       vect_print_slp_graph (dump_kind, loc, kid, visited);
2656 }
2657
2658 /* Dump the SLP graph reachable from ENTRY using DUMP_KIND and LOC.  */
2659
2660 static void
2661 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2662                       slp_tree entry)
2663 {
2664   hash_set<slp_tree> seen;
2665   vect_print_slp_graph (dump_kind, loc, entry, seen);
2666 }
2667
2668 /* Set STMT_SLP_TYPE to pure_slp on the scalar stmts of all internal-def
2669    nodes reachable from NODE.  VISITED holds the nodes handled so far.  */
2670
2671 static void
2672 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2673 {
2674   /* Nothing to do for non-internal defs or for nodes that were seen
2675      before.  */
2676   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
2677       || visited.add (node))
2678     return;
2679
2680   for (stmt_vec_info sinfo : SLP_TREE_SCALAR_STMTS (node))
2681     STMT_SLP_TYPE (sinfo) = pure_slp;
2682
2683   for (slp_tree kid : SLP_TREE_CHILDREN (node))
2684     if (kid)
2685       vect_mark_slp_stmts (kid, visited);
2686 }
2687
2688 /* Mark the tree rooted at NODE with PURE_SLP, starting from an empty set
2689    of visited nodes.  */
2690
2691 static void
2692 vect_mark_slp_stmts (slp_tree node)
2693 {
2694   hash_set<slp_tree> seen;
2695   vect_mark_slp_stmts (node, seen);
2696 }
2697
2698 /* Mark the scalar stmts of all internal-def nodes reachable from NODE as
2699    relevant (vect_used_in_scope).  VISITED holds the nodes handled so far.  */
2700
2701 static void
2702 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2703 {
2704   /* Skip non-internal defs and nodes that were seen before.  */
2705   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
2706       || visited.add (node))
2707     return;
2708
2709   for (stmt_vec_info sinfo : SLP_TREE_SCALAR_STMTS (node))
2710     {
2711       /* A stmt is expected to be unmarked or vect_used_in_scope already.  */
2712       gcc_assert (!STMT_VINFO_RELEVANT (sinfo)
2713                   || STMT_VINFO_RELEVANT (sinfo) == vect_used_in_scope);
2714       STMT_VINFO_RELEVANT (sinfo) = vect_used_in_scope;
2715     }
2716
2717   for (slp_tree kid : SLP_TREE_CHILDREN (node))
2718     if (kid)
2719       vect_mark_slp_stmts_relevant (kid, visited);
2720 }
2721
2722 /* Mark the statements of the tree rooted at NODE as relevant (vect_used),
2723    starting from an empty set of visited nodes.  */
2724
2725 static void
2726 vect_mark_slp_stmts_relevant (slp_tree node)
2727 {
2728   hash_set<slp_tree> seen;
2729   vect_mark_slp_stmts_relevant (node, seen);
2730 }
2731
2732
2733 /* Gather the load nodes of the SLP graph rooted at NODE into LOADS: the
2734    childless internal defs whose first scalar stmt is a grouped read.
2735    VISITED holds the nodes handled so far.  */
2736
2737 static void
2738 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2739                        hash_set<slp_tree> &visited)
2740 {
2741   if (!node || visited.add (node))
2742     return;
2743
2744   /* Inner nodes only forward to their children.  */
2745   if (!SLP_TREE_CHILDREN (node).is_empty ())
2746     {
2747       for (slp_tree kid : SLP_TREE_CHILDREN (node))
2748         vect_gather_slp_loads (loads, kid, visited);
2749       return;
2750     }
2751
2752   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2753     return;
2754   stmt_vec_info first = SLP_TREE_SCALAR_STMTS (node)[0];
2755   if (STMT_VINFO_GROUPED_ACCESS (first)
2756       && DR_IS_READ (STMT_VINFO_DATA_REF (first)))
2757     loads.safe_push (node);
2758 }
2759
2760
2761 /* Return the latest of the scalar stmts of NODE, each mapped through
2762    vect_orig_stmt and compared with get_later_stmt, or NULL if NODE has
2763    no scalar stmts.  */
2764
2765 stmt_vec_info
2766 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2767 {
2768   stmt_vec_info latest = NULL;
2769   for (stmt_vec_info cur : SLP_TREE_SCALAR_STMTS (node))
2770     {
2771       stmt_vec_info orig = vect_orig_stmt (cur);
2772       latest = latest ? get_later_stmt (orig, latest) : orig;
2773     }
2774
2775   return latest;
2776 }
2777
2778 /* Return the earliest of the scalar stmts of NODE, each mapped through
2779    vect_orig_stmt, or NULL if NODE has no scalar stmts.  */
2780
2781 stmt_vec_info
2782 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2783 {
2784   stmt_vec_info earliest = NULL;
2785   for (stmt_vec_info cur : SLP_TREE_SCALAR_STMTS (node))
2786     {
2787       stmt_vec_info orig = vect_orig_stmt (cur);
2788       /* Take ORIG when get_later_stmt picks the current EARLIEST as the
2789          later of the two.  */
2790       if (!earliest || get_later_stmt (orig, earliest) == earliest)
2791         earliest = orig;
2792     }
2793
2794   return earliest;
2795 }
2796
2797 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2798    two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2799    (also containing the first GROUP1_SIZE stmts, since stores are
2800    consecutive), the second containing the remainder.
2801    Return the first stmt in the second group.  */
2802
2803 static stmt_vec_info
2804 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2805 {
       /* FIRST_VINFO has to be the head of its group and both resulting
          groups have to be non-empty.  */
2806   gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2807   gcc_assert (group1_size > 0);
2808   int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2809   gcc_assert (group2_size > 0);
2810   DR_GROUP_SIZE (first_vinfo) = group1_size;
2811
       /* Walk GROUP1_SIZE - 1 links forward, checking that each stmt
          stepped onto has a gap of one.  */
2812   stmt_vec_info stmt_info = first_vinfo;
2813   for (unsigned i = group1_size; i > 1; i--)
2814     {
2815       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2816       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2817     }
2818   /* STMT_INFO is now the last element of the first group.  */
2819   stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2820   DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2821
       /* Make GROUP2 the first element of all remaining stmts.  */
2822   DR_GROUP_SIZE (group2) = group2_size;
2823   for (stmt_info = group2; stmt_info;
2824        stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2825     {
2826       DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2827       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2828     }
2829
2830   /* For the second group, the DR_GROUP_GAP is that before the original group,
2831      plus skipping over the first vector.  */
2832   DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2833
2834   /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
2835   DR_GROUP_GAP (first_vinfo) += group2_size;
2836
       /* NOTE(review): GROUP1_SIZE is unsigned but is printed with %d.  */
2837   if (dump_enabled_p ())
2838     dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2839                      group1_size, group2_size);
2840
2841   return group2;
2842 }
2843
2844 /* Return the SLP unrolling factor for GROUP_SIZE stmts and NUNITS lanes.  */
2845
2846 static poly_uint64
2847 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2848 {
2849   poly_uint64 multiple = common_multiple (nunits, group_size);
2850   return exact_div (multiple, group_size);
2851 }
2852
2853 /* Return true if ROOT is an internal-def node for a grouped read.  */
2854
2855 static inline bool
2856 vect_is_slp_load_node (slp_tree root)
2857 {
2858   stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (root);
2859   return (SLP_TREE_DEF_TYPE (root) == vect_internal_def
2860           && STMT_VINFO_GROUPED_ACCESS (rep)
2861           && DR_IS_READ (STMT_VINFO_DATA_REF (rep)));
2862 }
2863
2864 /* Helper function of optimize_load_redistribution that performs the operation
2865    recursively.  Return the node the caller is to put in place of ROOT, or
        NULL if ROOT is to be kept.

        A VEC_PERM node whose lanes all come from load nodes without children
        is rebuilt from the selected scalar stmts with vect_build_slp_tree
        and the result, which may be NULL, is returned.  For any other
        internal def the children are processed recursively, replaced
        children are swapped in place and NULL is returned.  LOAD_MAP
        records the result computed for each node handled so far.  */
2866
2867 static slp_tree
2868 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2869                                 vec_info *vinfo, unsigned int group_size,
2870                                 hash_map<slp_tree, slp_tree> *load_map,
2871                                 slp_tree root)
2872 {
       /* Reuse the result recorded for ROOT by an earlier visit.  */
2873   if (slp_tree *leader = load_map->get (root))
2874     return *leader;
2875
2876   slp_tree node;
2877   unsigned i;
2878
2879   /* For now, we don't know anything about externals so do not do anything.  */
2880   if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2881     return NULL;
2882   else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2883     {
2884       /* First convert this node into a load node and add it to the leaves
2885          list and flatten the permute from a lane to a load one.  If it's
2886          unneeded it will be elided later.  */
2887       vec<stmt_vec_info> stmts;
2888       stmts.create (SLP_TREE_LANES (root));
2889       lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2890       for (unsigned j = 0; j < lane_perm.length (); j++)
2891         {
2892           std::pair<unsigned, unsigned> perm = lane_perm[j];
2893           node = SLP_TREE_CHILDREN (root)[perm.first];
2894
               /* Every lane has to come from a load node without children,
                  otherwise give up on ROOT itself and only process its
                  children.  */
2895           if (!vect_is_slp_load_node (node)
2896               || SLP_TREE_CHILDREN (node).exists ())
2897             {
2898               stmts.release ();
2899               goto next;
2900             }
2901
2902           stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2903         }
2904
2905       if (dump_enabled_p ())
2906         dump_printf_loc (MSG_NOTE, vect_location,
2907                          "converting stmts on permute node %p\n",
2908                          (void *) root);
2909
           /* Try to build a node for the gathered stmts.  MATCHES is only
              scratch space for the build.  */
2910       bool *matches = XALLOCAVEC (bool, group_size);
2911       poly_uint64 max_nunits = 1;
2912       unsigned tree_size = 0, limit = 1;
2913       node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2914                                   matches, &limit, &tree_size, bst_map);
           /* STMTS is released here only when the build failed; presumably
              the build takes ownership of it on success -- TODO confirm.  */
2915       if (!node)
2916         stmts.release ();
2917
2918       load_map->put (root, node);
2919       return node;
2920     }
2921
2922 next:
       /* Record NULL for ROOT before recursing into the children, so a
          later visit of ROOT returns early.  */
2923   load_map->put (root, NULL);
2924
2925   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2926     {
2927       slp_tree value
2928         = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2929                                           node);
2930       if (value)
2931         {
               /* ROOT now references VALUE instead of NODE, which is
                  released below.  */
2932           SLP_TREE_REF_COUNT (value)++;
2933           SLP_TREE_CHILDREN (root)[i] = value;
2934           /* ???  We know the original leafs of the replaced nodes will
2935              be referenced by bst_map, only the permutes created by
2936              pattern matching are not.  */
2937           if (SLP_TREE_REF_COUNT (node) == 1)
2938             load_map->remove (node);
2939           vect_free_slp_tree (node);
2940         }
2941     }
2942
2943   return NULL;
2944 }
2945
2946 /* Temporary workaround for loads not being CSEd during SLP build.  For each
2947    child of ROOT this calls optimize_load_redistribution_1, which looks for
2948    VEC_PERM nodes whose lanes all come from load nodes and rebuilds them as
2949    load nodes through vect_build_slp_tree, CSEd using BST_MAP.  A child for
2950    which a replacement is returned is swapped for it in ROOT.  LOAD_MAP
2951    caches the result computed for each node.  */
2952
2953 static void
2954 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2955                               vec_info *vinfo, unsigned int group_size,
2956                               hash_map<slp_tree, slp_tree> *load_map,
2957                               slp_tree root)
2958 {
2959   for (unsigned idx = 0; idx < SLP_TREE_CHILDREN (root).length (); ++idx)
2960     {
2961       slp_tree old_child = SLP_TREE_CHILDREN (root)[idx];
2962       slp_tree replacement
2963         = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2964                                           old_child);
2965       if (!replacement)
2966         continue;
2967
2968       /* ROOT now references REPLACEMENT instead of OLD_CHILD, which is
2969          released below.  */
2970       SLP_TREE_REF_COUNT (replacement)++;
2971       SLP_TREE_CHILDREN (root)[idx] = replacement;
2972       /* ???  We know the original leafs of the replaced nodes will
2973          be referenced by bst_map, only the permutes created by
2974          pattern matching are not.  */
2975       if (SLP_TREE_REF_COUNT (old_child) == 1)
2976         load_map->remove (old_child);
2977       vect_free_slp_tree (old_child);
2978     }
2979 }
2980
2981 /* Helper function of vect_match_slp_patterns.
2982
2983    Tries the patterns in slp_patterns on the SLP tree rooted in *REF_NODE.
2984    Children are handled before the node itself (post-order) and VISITED
2985    prevents handling a node twice.  A matching pattern is built using
2986    VINFO; the pattern is given REF_NODE and so may replace *REF_NODE.
2987
2988    Return true if any pattern matched in the tree.  */
2989
2990 static bool
2991 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2992                            slp_tree_to_load_perm_map_t *perm_cache,
2993                            slp_compat_nodes_map_t *compat_cache,
2994                            hash_set<slp_tree> *visited)
2995 {
2996   slp_tree node = *ref_node;
2997   if (!node || visited->add (node))
2998     return false;
2999
3000   bool matched_p = false;
3001   /* Hand the address of each child slot, not a copy, to the recursion.  */
3002   for (unsigned idx = 0; idx < SLP_TREE_CHILDREN (node).length (); ++idx)
3003     if (vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[idx], vinfo,
3004                                    perm_cache, compat_cache, visited))
3005       matched_p = true;
3006
3007   /* Then try every pattern on this node.  */
3008   for (unsigned pat = 0; pat < num__slp_patterns; ++pat)
3009     {
3010       vect_pattern *match
3011         = slp_patterns[pat] (perm_cache, compat_cache, ref_node);
3012       if (!match)
3013         continue;
3014       match->build (vinfo);
3015       delete match;
3016       matched_p = true;
3017     }
3018
3019   return matched_p;
3020 }
3021
3022 /* Apply SLP pattern matching to the tree of INSTANCE using vec_info VINFO.
3023    The root is passed on by address so that a match may replace it.
3024    VISITED, PERM_CACHE and COMPAT_CACHE are handed to the recursive helper.
3025
3026    Return true if any pattern matched.  */
3027
3028 static bool
3029 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3030                          hash_set<slp_tree> *visited,
3031                          slp_tree_to_load_perm_map_t *perm_cache,
3032                          slp_compat_nodes_map_t *compat_cache)
3033 {
3034   DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3035
3036   if (dump_enabled_p ())
3037     dump_printf_loc (MSG_NOTE, vect_location,
3038                      "Analyzing SLP tree %p for patterns\n",
3039                      (void *) SLP_INSTANCE_TREE (instance));
3040
3041   slp_tree *root_slot = &SLP_INSTANCE_TREE (instance);
3042   return vect_match_slp_patterns_2 (root_slot, vinfo, perm_cache,
3043                                     compat_cache, visited);
3044 }
3045
3046 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3047    splitting into two, with the first split group having size NEW_GROUP_SIZE.
3048    Return true if we could use IFN_STORE_LANES instead and if that appears
3049    to be the better approach.  */
3050
3051 static bool
3052 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3053                                unsigned int group_size,
3054                                unsigned int new_group_size)
3055 {
3056   tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3057   tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3058   if (!vectype)
3059     return false;
3060   /* Allow the split if one of the two new groups would operate on full
3061      vectors *within* rather than across one scalar loop iteration.
3062      This is purely a heuristic, but it should work well for group
3063      sizes of 3 and 4, where the possible splits are:
3064
3065        3->2+1:  OK if the vector has exactly two elements
3066        4->2+2:  Likewise
3067        4->3+1:  Less clear-cut.  */
3068   if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3069       || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3070     return false;
3071   return vect_store_lanes_supported (vectype, group_size, false);
3072 }
3073
3074 /* Analyze an SLP instance starting from a group of grouped stores.  Call
3075    vect_build_slp_tree to build a tree of packed stmts if possible.
3076    Return FALSE if it's impossible to SLP any stmt in the loop.  */
3077
3078 static bool
3079 vect_analyze_slp_instance (vec_info *vinfo,
3080                            scalar_stmts_to_slp_tree_map_t *bst_map,
3081                            stmt_vec_info stmt_info, slp_instance_kind kind,
3082                            unsigned max_tree_size, unsigned *limit);
3083
3084 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3085    of KIND.  Return true if successful.  */
3086
3087 static bool
3088 vect_build_slp_instance (vec_info *vinfo,
3089                          slp_instance_kind kind,
3090                          vec<stmt_vec_info> &scalar_stmts,
3091                          vec<stmt_vec_info> &root_stmt_infos,
3092                          unsigned max_tree_size, unsigned *limit,
3093                          scalar_stmts_to_slp_tree_map_t *bst_map,
3094                          /* ???  We need stmt_info for group splitting.  */
3095                          stmt_vec_info stmt_info_)
3096 {
3097   if (dump_enabled_p ())
3098     {
3099       dump_printf_loc (MSG_NOTE, vect_location,
3100                        "Starting SLP discovery for\n");
3101       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3102         dump_printf_loc (MSG_NOTE, vect_location,
3103                          "  %G", scalar_stmts[i]->stmt);
3104     }
3105
3106   /* Build the tree for the SLP instance.  */
3107   unsigned int group_size = scalar_stmts.length ();
3108   bool *matches = XALLOCAVEC (bool, group_size);
3109   poly_uint64 max_nunits = 1;
3110   unsigned tree_size = 0;
3111   unsigned i;
3112   slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3113                                        &max_nunits, matches, limit,
3114                                        &tree_size, bst_map);
3115   if (node != NULL)
3116     {
3117       /* Calculate the unrolling factor based on the smallest type.  */
3118       poly_uint64 unrolling_factor
3119         = calculate_unrolling_factor (max_nunits, group_size);
3120
3121       if (maybe_ne (unrolling_factor, 1U)
3122           && is_a <bb_vec_info> (vinfo))
3123         {
3124           unsigned HOST_WIDE_INT const_max_nunits;
3125           if (!max_nunits.is_constant (&const_max_nunits)
3126               || const_max_nunits > group_size)
3127             {
3128               if (dump_enabled_p ())
3129                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3130                                  "Build SLP failed: store group "
3131                                  "size not a multiple of the vector size "
3132                                  "in basic block SLP\n");
3133               vect_free_slp_tree (node);
3134               return false;
3135             }
3136           /* Fatal mismatch.  */
3137           if (dump_enabled_p ())
3138             dump_printf_loc (MSG_NOTE, vect_location,
3139                              "SLP discovery succeeded but node needs "
3140                              "splitting\n");
3141           memset (matches, true, group_size);
3142           matches[group_size / const_max_nunits * const_max_nunits] = false;
3143           vect_free_slp_tree (node);
3144         }
3145       else
3146         {
3147           /* Create a new SLP instance.  */
3148           slp_instance new_instance = XNEW (class _slp_instance);
3149           SLP_INSTANCE_TREE (new_instance) = node;
3150           SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3151           SLP_INSTANCE_LOADS (new_instance) = vNULL;
3152           SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3153           SLP_INSTANCE_KIND (new_instance) = kind;
3154           new_instance->reduc_phis = NULL;
3155           new_instance->cost_vec = vNULL;
3156           new_instance->subgraph_entries = vNULL;
3157
3158           if (dump_enabled_p ())
3159             dump_printf_loc (MSG_NOTE, vect_location,
3160                              "SLP size %u vs. limit %u.\n",
3161                              tree_size, max_tree_size);
3162
3163           /* Fixup SLP reduction chains.  */
3164           if (kind == slp_inst_kind_reduc_chain)
3165             {
3166               /* If this is a reduction chain with a conversion in front
3167                  amend the SLP tree with a node for that.  */
3168               gimple *scalar_def
3169                 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3170               if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3171                 {
3172                   /* Get at the conversion stmt - we know it's the single use
3173                      of the last stmt of the reduction chain.  */
3174                   use_operand_p use_p;
3175                   bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3176                                            &use_p, &scalar_def);
3177                   gcc_assert (r);
3178                   stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3179                   next_info = vect_stmt_to_vectorize (next_info);
3180                   scalar_stmts = vNULL;
3181                   scalar_stmts.create (group_size);
3182                   for (unsigned i = 0; i < group_size; ++i)
3183                     scalar_stmts.quick_push (next_info);
3184                   slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3185                   SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3186                   SLP_TREE_CHILDREN (conv).quick_push (node);
3187                   SLP_INSTANCE_TREE (new_instance) = conv;
3188                   /* We also have to fake this conversion stmt as SLP reduction
3189                      group so we don't have to mess with too much code
3190                      elsewhere.  */
3191                   REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3192                   REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3193                 }
3194               /* Fill the backedge child of the PHI SLP node.  The
3195                  general matching code cannot find it because the
3196                  scalar code does not reflect how we vectorize the
3197                  reduction.  */
3198               use_operand_p use_p;
3199               imm_use_iterator imm_iter;
3200               class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3201               FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3202                                      gimple_get_lhs (scalar_def))
3203                 /* There are exactly two non-debug uses, the reduction
3204                    PHI and the loop-closed PHI node.  */
3205                 if (!is_gimple_debug (USE_STMT (use_p))
3206                     && gimple_bb (USE_STMT (use_p)) == loop->header)
3207                   {
3208                     auto_vec<stmt_vec_info, 64> phis (group_size);
3209                     stmt_vec_info phi_info
3210                       = vinfo->lookup_stmt (USE_STMT (use_p));
3211                     for (unsigned i = 0; i < group_size; ++i)
3212                       phis.quick_push (phi_info);
3213                     slp_tree *phi_node = bst_map->get (phis);
3214                     unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3215                     SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3216                       = SLP_INSTANCE_TREE (new_instance);
3217                     SLP_INSTANCE_TREE (new_instance)->refcnt++;
3218                   }
3219             }
3220
3221           vinfo->slp_instances.safe_push (new_instance);
3222
3223           /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3224              the number of scalar stmts in the root in a few places.
3225              Verify that assumption holds.  */
3226           gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3227                         .length () == group_size);
3228
3229           if (dump_enabled_p ())
3230             {
3231               dump_printf_loc (MSG_NOTE, vect_location,
3232                                "Final SLP tree for instance %p:\n",
3233                                (void *) new_instance);
3234               vect_print_slp_graph (MSG_NOTE, vect_location,
3235                                     SLP_INSTANCE_TREE (new_instance));
3236             }
3237
3238           return true;
3239         }
3240     }
3241   else
3242     {
3243       /* Failed to SLP.  */
3244       /* Free the allocated memory.  */
3245       scalar_stmts.release ();
3246     }
3247
3248   stmt_vec_info stmt_info = stmt_info_;
3249   /* Try to break the group up into pieces.  */
3250   if (kind == slp_inst_kind_store)
3251     {
3252       /* ???  We could delay all the actual splitting of store-groups
3253          until after SLP discovery of the original group completed.
3254          Then we can recurse to vect_build_slp_instance directly.  */
3255       for (i = 0; i < group_size; i++)
3256         if (!matches[i])
3257           break;
3258
3259       /* For basic block SLP, try to break the group up into multiples of
3260          a vector size.  */
3261       if (is_a <bb_vec_info> (vinfo)
3262           && (i > 1 && i < group_size))
3263         {
3264           tree scalar_type
3265             = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3266           tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3267                                                       1 << floor_log2 (i));
3268           unsigned HOST_WIDE_INT const_nunits;
3269           if (vectype
3270               && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3271             {
3272               /* Split into two groups at the first vector boundary.  */
3273               gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3274               unsigned group1_size = i & ~(const_nunits - 1);
3275
3276               if (dump_enabled_p ())
3277                 dump_printf_loc (MSG_NOTE, vect_location,
3278                                  "Splitting SLP group at stmt %u\n", i);
3279               stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3280                                                                group1_size);
3281               bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3282                                                     kind, max_tree_size,
3283                                                     limit);
3284               /* Split the rest at the failure point and possibly
3285                  re-analyze the remaining matching part if it has
3286                  at least two lanes.  */
3287               if (group1_size < i
3288                   && (i + 1 < group_size
3289                       || i - group1_size > 1))
3290                 {
3291                   stmt_vec_info rest2 = rest;
3292                   rest = vect_split_slp_store_group (rest, i - group1_size);
3293                   if (i - group1_size > 1)
3294                     res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3295                                                       kind, max_tree_size,
3296                                                       limit);
3297                 }
3298               /* Re-analyze the non-matching tail if it has at least
3299                  two lanes.  */
3300               if (i + 1 < group_size)
3301                 res |= vect_analyze_slp_instance (vinfo, bst_map,
3302                                                   rest, kind, max_tree_size,
3303                                                   limit);
3304               return res;
3305             }
3306         }
3307
3308       /* For loop vectorization split into arbitrary pieces of size > 1.  */
3309       if (is_a <loop_vec_info> (vinfo)
3310           && (i > 1 && i < group_size)
3311           && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3312         {
3313           unsigned group1_size = i;
3314
3315           if (dump_enabled_p ())
3316             dump_printf_loc (MSG_NOTE, vect_location,
3317                              "Splitting SLP group at stmt %u\n", i);
3318
3319           stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3320                                                            group1_size);
3321           /* Loop vectorization cannot handle gaps in stores, make sure
3322              the split group appears as strided.  */
3323           STMT_VINFO_STRIDED_P (rest) = 1;
3324           DR_GROUP_GAP (rest) = 0;
3325           STMT_VINFO_STRIDED_P (stmt_info) = 1;
3326           DR_GROUP_GAP (stmt_info) = 0;
3327
3328           bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3329                                                 kind, max_tree_size, limit);
3330           if (i + 1 < group_size)
3331             res |= vect_analyze_slp_instance (vinfo, bst_map,
3332                                               rest, kind, max_tree_size, limit);
3333
3334           return res;
3335         }
3336
3337       /* Even though the first vector did not all match, we might be able to SLP
3338          (some) of the remainder.  FORNOW ignore this possibility.  */
3339     }
3340
3341   /* Failed to SLP.  */
3342   if (dump_enabled_p ())
3343     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3344   return false;
3345 }
3346
3347
3348 /* Analyze an SLP instance starting from a group of grouped stores.  Call
3349    vect_build_slp_tree to build a tree of packed stmts if possible.
3350    Return FALSE if it's impossible to SLP any stmt in the loop.  */
3351
3352 static bool
3353 vect_analyze_slp_instance (vec_info *vinfo,
3354                            scalar_stmts_to_slp_tree_map_t *bst_map,
3355                            stmt_vec_info stmt_info,
3356                            slp_instance_kind kind,
3357                            unsigned max_tree_size, unsigned *limit)
3358 {
3359   unsigned int i;
3360   vec<stmt_vec_info> scalar_stmts;
3361
3362   if (is_a <bb_vec_info> (vinfo))
3363     vect_location = stmt_info->stmt;
3364
3365   stmt_vec_info next_info = stmt_info;
3366   if (kind == slp_inst_kind_store)
3367     {
3368       /* Collect the stores and store them in scalar_stmts.  */
3369       scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3370       while (next_info)
3371         {
3372           scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3373           next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3374         }
3375     }
3376   else if (kind == slp_inst_kind_reduc_chain)
3377     {
3378       /* Collect the reduction stmts and store them in scalar_stmts.  */
3379       scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3380       while (next_info)
3381         {
3382           scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3383           next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3384         }
3385       /* Mark the first element of the reduction chain as reduction to properly
3386          transform the node.  In the reduction analysis phase only the last
3387          element of the chain is marked as reduction.  */
3388       STMT_VINFO_DEF_TYPE (stmt_info)
3389         = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3390       STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3391         = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3392     }
3393   else if (kind == slp_inst_kind_ctor)
3394     {
3395       tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
3396       tree val;
3397       scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
3398       FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
3399         {
3400           stmt_vec_info def_info = vinfo->lookup_def (val);
3401           def_info = vect_stmt_to_vectorize (def_info);
3402           scalar_stmts.quick_push (def_info);
3403         }
3404       if (dump_enabled_p ())
3405         dump_printf_loc (MSG_NOTE, vect_location,
3406                          "Analyzing vectorizable constructor: %G\n",
3407                          stmt_info->stmt);
3408     }
3409   else if (kind == slp_inst_kind_reduc_group)
3410     {
3411       /* Collect reduction statements.  */
3412       const vec<stmt_vec_info> &reductions
3413         = as_a <loop_vec_info> (vinfo)->reductions;
3414       scalar_stmts.create (reductions.length ());
3415       for (i = 0; reductions.iterate (i, &next_info); i++)
3416         if ((STMT_VINFO_RELEVANT_P (next_info)
3417              || STMT_VINFO_LIVE_P (next_info))
3418             /* ???  Make sure we didn't skip a conversion around a reduction
3419                path.  In that case we'd have to reverse engineer that conversion
3420                stmt following the chain using reduc_idx and from the PHI
3421                using reduc_def.  */
3422             && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3423           scalar_stmts.quick_push (next_info);
3424       /* If less than two were relevant/live there's nothing to SLP.  */
3425       if (scalar_stmts.length () < 2)
3426         return false;
3427     }
3428   else
3429     gcc_unreachable ();
3430
3431   vec<stmt_vec_info> roots = vNULL;
3432   if (kind == slp_inst_kind_ctor)
3433     {
3434       roots.create (1);
3435       roots.quick_push (stmt_info);
3436     }
3437   /* Build the tree for the SLP instance.  */
3438   bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3439                                       roots,
3440                                       max_tree_size, limit, bst_map,
3441                                       kind == slp_inst_kind_store
3442                                       ? stmt_info : NULL);
3443   if (!res)
3444     roots.release ();
3445
3446   /* ???  If this is slp_inst_kind_store and the above succeeded here's
3447      where we should do store group splitting.  */
3448
3449   return res;
3450 }
3451
3452 /* Check if there are stmts in the loop can be vectorized using SLP.  Build SLP
3453    trees of packed scalar stmts if SLP is possible.  */
3454
3455 opt_result
3456 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3457 {
3458   unsigned int i;
3459   stmt_vec_info first_element;
3460   slp_instance instance;
3461
3462   DUMP_VECT_SCOPE ("vect_analyze_slp");
3463
3464   unsigned limit = max_tree_size;
3465
3466   scalar_stmts_to_slp_tree_map_t *bst_map
3467     = new scalar_stmts_to_slp_tree_map_t ();
3468
3469   /* Find SLP sequences starting from groups of grouped stores.  */
3470   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3471     vect_analyze_slp_instance (vinfo, bst_map, first_element,
3472                                STMT_VINFO_GROUPED_ACCESS (first_element)
3473                                ? slp_inst_kind_store : slp_inst_kind_ctor,
3474                                max_tree_size, &limit);
3475
3476   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3477     {
3478       for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3479         {
3480           vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3481           if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3482                                        bb_vinfo->roots[i].stmts,
3483                                        bb_vinfo->roots[i].roots,
3484                                        max_tree_size, &limit, bst_map, NULL))
3485             {
3486               bb_vinfo->roots[i].stmts = vNULL;
3487               bb_vinfo->roots[i].roots = vNULL;
3488             }
3489         }
3490     }
3491
3492   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3493     {
3494       /* Find SLP sequences starting from reduction chains.  */
3495       FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3496         if (! STMT_VINFO_RELEVANT_P (first_element)
3497             && ! STMT_VINFO_LIVE_P (first_element))
3498           ;
3499         else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3500                                               slp_inst_kind_reduc_chain,
3501                                               max_tree_size, &limit))
3502           {
3503             /* Dissolve reduction chain group.  */
3504             stmt_vec_info vinfo = first_element;
3505             stmt_vec_info last = NULL;
3506             while (vinfo)
3507               {
3508                 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3509                 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3510                 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3511                 last = vinfo;
3512                 vinfo = next;
3513               }
3514             STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3515             /* It can be still vectorized as part of an SLP reduction.  */
3516             loop_vinfo->reductions.safe_push (last);
3517           }
3518
3519       /* Find SLP sequences starting from groups of reductions.  */
3520       if (loop_vinfo->reductions.length () > 1)
3521         vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3522                                    slp_inst_kind_reduc_group, max_tree_size,
3523                                    &limit);
3524     }
3525
3526   hash_set<slp_tree> visited_patterns;
3527   slp_tree_to_load_perm_map_t perm_cache;
3528   slp_compat_nodes_map_t compat_cache;
3529
3530   /* See if any patterns can be found in the SLP tree.  */
3531   bool pattern_found = false;
3532   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3533     pattern_found |= vect_match_slp_patterns (instance, vinfo,
3534                                               &visited_patterns, &perm_cache,
3535                                               &compat_cache);
3536
3537   /* If any were found optimize permutations of loads.  */
3538   if (pattern_found)
3539     {
3540       hash_map<slp_tree, slp_tree> load_map;
3541       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3542         {
3543           slp_tree root = SLP_INSTANCE_TREE (instance);
3544           optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3545                                         &load_map, root);
3546         }
3547     }
3548
3549
3550
3551   /* The map keeps a reference on SLP nodes built, release that.  */
3552   for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3553        it != bst_map->end (); ++it)
3554     if ((*it).second)
3555       vect_free_slp_tree ((*it).second);
3556   delete bst_map;
3557
3558   if (pattern_found && dump_enabled_p ())
3559     {
3560       dump_printf_loc (MSG_NOTE, vect_location,
3561                        "Pattern matched SLP tree\n");
3562       hash_set<slp_tree> visited;
3563       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3564         vect_print_slp_graph (MSG_NOTE, vect_location,
3565                               SLP_INSTANCE_TREE (instance), visited);
3566     }
3567
3568   return opt_result::success ();
3569 }
3570
3571 /* Estimates the cost of inserting layout changes into the SLP graph.
3572    It can also say that the insertion is impossible.  */
3573
3574 struct slpg_layout_cost
3575 {
3576   slpg_layout_cost () = default;
3577   slpg_layout_cost (sreal, bool);
3578
3579   static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3580   bool is_possible () const { return depth != sreal::max (); }
3581
3582   bool operator== (const slpg_layout_cost &) const;
3583   bool operator!= (const slpg_layout_cost &) const;
3584
3585   bool is_better_than (const slpg_layout_cost &, bool) const;
3586
3587   void add_parallel_cost (const slpg_layout_cost &);
3588   void add_serial_cost (const slpg_layout_cost &);
3589   void split (unsigned int);
3590
3591   /* The longest sequence of layout changes needed during any traversal
3592      of the partition dag, weighted by execution frequency.
3593
3594      This is the most important metric when optimizing for speed, since
3595      it helps to ensure that we keep the number of operations on
3596      critical paths to a minimum.  */
3597   sreal depth = 0;
3598
3599   /* An estimate of the total number of operations needed.  It is weighted by
3600      execution frequency when optimizing for speed but not when optimizing for
3601      size.  In order to avoid double-counting, a node with a fanout of N will
3602      distribute 1/N of its total cost to each successor.
3603
3604      This is the most important metric when optimizing for size, since
3605      it helps to keep the total number of operations to a minimum,  */
3606   sreal total = 0;
3607 };
3608
3609 /* Construct costs for a node with weight WEIGHT.  A higher weight
3610    indicates more frequent execution.  IS_FOR_SIZE is true if we are
3611    optimizing for size rather than speed.  */
3612
3613 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3614   : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3615 {
3616 }
3617
3618 bool
3619 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3620 {
3621   return depth == other.depth && total == other.total;
3622 }
3623
3624 bool
3625 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3626 {
3627   return !operator== (other);
3628 }
3629
3630 /* Return true if these costs are better than OTHER.  IS_FOR_SIZE is
3631    true if we are optimizing for size rather than speed.  */
3632
3633 bool
3634 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3635                                   bool is_for_size) const
3636 {
3637   if (is_for_size)
3638     {
3639       if (total != other.total)
3640         return total < other.total;
3641       return depth < other.depth;
3642     }
3643   else
3644     {
3645       if (depth != other.depth)
3646         return depth < other.depth;
3647       return total < other.total;
3648     }
3649 }
3650
3651 /* Increase the costs to account for something with cost INPUT_COST
3652    happening in parallel with the current costs.  */
3653
3654 void
3655 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3656 {
3657   depth = std::max (depth, input_cost.depth);
3658   total += input_cost.total;
3659 }
3660
3661 /* Increase the costs to account for something with cost INPUT_COST
3662    happening in series with the current costs.  */
3663
3664 void
3665 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3666 {
3667   depth += other.depth;
3668   total += other.total;
3669 }
3670
3671 /* Split the total cost among TIMES successors or predecessors.  */
3672
3673 void
3674 slpg_layout_cost::split (unsigned int times)
3675 {
3676   if (times > 1)
3677     total /= times;
3678 }
3679
3680 /* Information about one node in the SLP graph, for use during
3681    vect_optimize_slp_pass.  */
3682
3683 struct slpg_vertex
3684 {
3685   slpg_vertex (slp_tree node_) : node (node_) {}
3686
3687   /* The node itself.  */
3688   slp_tree node;
3689
3690   /* Which partition the node belongs to, or -1 if none.  Nodes outside of
3691      partitions are flexible; they can have whichever layout consumers
3692      want them to have.  */
3693   int partition = -1;
3694
3695   /* The number of nodes that directly use the result of this one
3696      (i.e. the number of nodes that count this one as a child).  */
3697   unsigned int out_degree = 0;
3698
3699   /* The execution frequency of the node.  */
3700   sreal weight = 0;
3701
3702   /* The total execution frequency of all nodes that directly use the
3703      result of this one.  */
3704   sreal out_weight = 0;
3705 };
3706
/* Per-partition bookkeeping for the SLP graph, used while running
   vect_optimize_slp_pass.  */

struct slpg_partition_info
{
  /* The partition's nodes are the entries of m_partitioned_nodes in the
     half-open index range [NODE_BEGIN, NODE_END).  */
  unsigned int node_begin = 0;
  unsigned int node_end = 0;

  /* The layout chosen for this partition, or -1 while none has been
     picked.  */
  int layout = -1;

  /* Number of predecessors (IN_DEGREE) and successors (OUT_DEGREE) in
     the partition dag.  Predecessors always have lower partition numbers
     and successors always have higher partition numbers.

     The direction of these edges need not agree with the data flow
     graph.  For example, if an SCC has separate partitions for an inner
     loop and an outer loop, the inner loop's partition will have at
     least two incoming edges from the outer loop's partition: one for a
     live-in value and one for a live-out value.  In data flow terms, one
     of these edges would also be from the outer loop to the inner loop,
     but the other would be in the opposite direction.  */
  unsigned int in_degree = 0;
  unsigned int out_degree = 0;
};
3735
3736 /* Information about the costs of using a particular layout for a
3737    particular partition.  It can also say that the combination is
3738    impossible.  */
3739
3740 struct slpg_partition_layout_costs
3741 {
3742   bool is_possible () const { return internal_cost.is_possible (); }
3743   void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3744
3745   /* The costs inherited from predecessor partitions.  */
3746   slpg_layout_cost in_cost;
3747
3748   /* The inherent cost of the layout within the node itself.  For example,
3749      this is nonzero for a load if choosing a particular layout would require
3750      the load to permute the loaded elements.  It is nonzero for a
3751      VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3752      to full-vector moves.  */
3753   slpg_layout_cost internal_cost;
3754
3755   /* The costs inherited from successor partitions.  */
3756   slpg_layout_cost out_cost;
3757 };
3758
3759 /* This class tries to optimize the layout of vectors in order to avoid
3760    unnecessary shuffling.  At the moment, the set of possible layouts are
3761    restricted to bijective permutations.
3762
3763    The goal of the pass depends on whether we're optimizing for size or
3764    for speed.  When optimizing for size, the goal is to reduce the overall
3765    number of layout changes (including layout changes implied by things
3766    like load permutations).  When optimizing for speed, the goal is to
3767    reduce the maximum latency attributable to layout changes on any
3768    non-cyclical path through the data flow graph.
3769
3770    For example, when optimizing a loop nest for speed, we will prefer
3771    to make layout changes outside of a loop rather than inside of a loop,
3772    and will prefer to make layout changes in parallel rather than serially,
3773    even if that increases the overall number of layout changes.
3774
3775    The high-level procedure is:
3776
3777    (1) Build a graph in which edges go from uses (parents) to definitions
3778        (children).
3779
3780    (2) Divide the graph into a dag of strongly-connected components (SCCs).
3781
3782    (3) When optimizing for speed, partition the nodes in each SCC based
3783        on their containing cfg loop.  When optimizing for size, treat
3784        each SCC as a single partition.
3785
3786        This gives us a dag of partitions.  The goal is now to assign a
3787        layout to each partition.
3788
3789    (4) Construct a set of vector layouts that are worth considering.
3790        Record which nodes must keep their current layout.
3791
3792    (5) Perform a forward walk over the partition dag (from loads to stores)
3793        accumulating the "forward" cost of using each layout.  When visiting
3794        each partition, assign a tentative choice of layout to the partition
3795        and use that choice when calculating the cost of using a different
3796        layout in successor partitions.
3797
3798    (6) Perform a backward walk over the partition dag (from stores to loads),
3799        accumulating the "backward" cost of using each layout.  When visiting
3800        each partition, make a final choice of layout for that partition based
3801        on the accumulated forward costs (from (5)) and backward costs
3802        (from (6)).
3803
3804    (7) Apply the chosen layouts to the SLP graph.
3805
3806    For example, consider the SLP statements:
3807
3808    S1:      a_1 = load
3809        loop:
3810    S2:      a_2 = PHI<a_1, a_3>
3811    S3:      b_1 = load
3812    S4:      a_3 = a_2 + b_1
3813        exit:
3814    S5:      a_4 = PHI<a_3>
3815    S6:      store a_4
3816
3817    S2 and S4 form an SCC and are part of the same loop.  Every other
3818    statement is in a singleton SCC.  In this example there is a one-to-one
   mapping between SCCs and partitions and the partition dag looks like this:
3820
3821         S1     S3
3822          \     /
3823           S2+S4
3824             |
3825            S5
3826             |
3827            S6
3828
3829    S2, S3 and S4 will have a higher execution frequency than the other
3830    statements, so when optimizing for speed, the goal is to avoid any
3831    layout changes:
3832
3833    - within S3
3834    - within S2+S4
3835    - on the S3->S2+S4 edge
3836
3837    For example, if S3 was originally a reversing load, the goal of the
3838    pass is to make it an unreversed load and change the layout on the
3839    S1->S2+S4 and S2+S4->S5 edges to compensate.  (Changing the layout
3840    on S1->S2+S4 and S5->S6 would also be acceptable.)
3841
3842    The difference between SCCs and partitions becomes important if we
3843    add an outer loop:
3844
3845    S1:      a_1 = ...
3846        loop1:
3847    S2:      a_2 = PHI<a_1, a_6>
3848    S3:      b_1 = load
3849    S4:      a_3 = a_2 + b_1
3850        loop2:
3851    S5:      a_4 = PHI<a_3, a_5>
3852    S6:      c_1 = load
3853    S7:      a_5 = a_4 + c_1
3854        exit2:
3855    S8:      a_6 = PHI<a_5>
3856    S9:      store a_6
3857        exit1:
3858
3859    Here, S2, S4, S5, S7 and S8 form a single SCC.  However, when optimizing
3860    for speed, we usually do not want restrictions in the outer loop to "infect"
3861    the decision for the inner loop.  For example, if an outer-loop node
3862    in the SCC contains a statement with a fixed layout, that should not
3863    prevent the inner loop from using a different layout.  Conversely,
3864    the inner loop should not dictate a layout to the outer loop: if the
3865    outer loop does a lot of computation, then it may not be efficient to
3866    do all of that computation in the inner loop's preferred layout.
3867
3868    So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
3869    and S5+S7 (inner).  We also try to arrange partitions so that:
3870
3871    - the partition for an outer loop comes before the partition for
3872      an inner loop
3873
3874    - if a sibling loop A dominates a sibling loop B, A's partition
3875      comes before B's
3876
3877    This gives the following partition dag for the example above:
3878
3879         S1        S3
3880          \        /
3881           S2+S4+S8   S6
3882            |   \\    /
3883            |    S5+S7
3884            |
3885           S9
3886
3887    There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
3888    one for a reversal of the edge S7->S8.
3889
3890    The backward walk picks a layout for S5+S7 before S2+S4+S8.  The choice
3891    for S2+S4+S8 therefore has to balance the cost of using the outer loop's
3892    preferred layout against the cost of changing the layout on entry to the
3893    inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
3894
3895    Although this works well when optimizing for speed, it has the downside
3896    when optimizing for size that the choice of layout for S5+S7 is completely
3897    independent of S9, which lessens the chance of reducing the overall number
3898    of permutations.  We therefore do not partition SCCs when optimizing
3899    for size.
3900
3901    To give a concrete example of the difference between optimizing
3902    for size and speed, consider:
3903
3904    a[0] = (b[1] << c[3]) - d[1];
3905    a[1] = (b[0] << c[2]) - d[0];
3906    a[2] = (b[3] << c[1]) - d[3];
3907    a[3] = (b[2] << c[0]) - d[2];
3908
3909    There are three different layouts here: one for a, one for b and d,
3910    and one for c.  When optimizing for speed it is better to permute each
3911    of b, c and d into the order required by a, since those permutations
3912    happen in parallel.  But when optimizing for size, it is better to:
3913
3914    - permute c into the same order as b
3915    - do the arithmetic
3916    - permute the result into the order required by a
3917
3918    This gives 2 permutations rather than 3.  */
3919
3920 class vect_optimize_slp_pass
3921 {
3922 public:
3923   vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
3924   void run ();
3925
3926 private:
3927   /* Graph building.  */
3928   struct loop *containing_loop (slp_tree);
3929   bool is_cfg_latch_edge (graph_edge *);
3930   void build_vertices (hash_set<slp_tree> &, slp_tree);
3931   void build_vertices ();
3932   void build_graph ();
3933
3934   /* Partitioning.  */
3935   void create_partitions ();
3936   template<typename T> void for_each_partition_edge (unsigned int, T);
3937
3938   /* Layout selection.  */
3939   bool is_compatible_layout (slp_tree, unsigned int);
3940   int change_layout_cost (slp_tree, unsigned int, unsigned int);
3941   slpg_partition_layout_costs &partition_layout_costs (unsigned int,
3942                                                        unsigned int);
3943   void change_vec_perm_layout (slp_tree, lane_permutation_t &,
3944                                int, unsigned int);
3945   int internal_node_cost (slp_tree, int, unsigned int);
3946   void start_choosing_layouts ();
3947
3948   /* Cost propagation.  */
3949   slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
3950                                      unsigned int, unsigned int);
3951   slpg_layout_cost total_in_cost (unsigned int);
3952   slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
3953   slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
3954   void forward_pass ();
3955   void backward_pass ();
3956
3957   /* Rematerialization.  */
3958   slp_tree get_result_with_layout (slp_tree, unsigned int);
3959   void materialize ();
3960
3961   /* Clean-up.  */
3962   void remove_redundant_permutations ();
3963
3964   void dump ();
3965
3966   vec_info *m_vinfo;
3967
3968   /* True if we should optimize the graph for size, false if we should
3969      optimize it for speed.  (It wouldn't be easy to make this decision
3970      more locally.)  */
3971   bool m_optimize_size;
3972
3973   /* A graph of all SLP nodes, with edges leading from uses to definitions.
3974      In other words, a node's predecessors are its slp_tree parents and
3975      a node's successors are its slp_tree children.  */
3976   graph *m_slpg = nullptr;
3977
3978   /* The vertices of M_SLPG, indexed by slp_tree::vertex.  */
3979   auto_vec<slpg_vertex> m_vertices;
3980
3981   /* The list of all leaves of M_SLPG. such as external definitions, constants,
3982      and loads.  */
3983   auto_vec<int> m_leafs;
3984
3985   /* This array has one entry for every vector layout that we're considering.
3986      Element 0 is null and indicates "no change".  Other entries describe
3987      permutations that are inherent in the current graph and that we would
3988      like to reverse if possible.
3989
3990      For example, a permutation { 1, 2, 3, 0 } means that something has
3991      effectively been permuted in that way, such as a load group
3992      { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
3993      We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
3994      in order to put things "back" in order.  */
3995   auto_vec<vec<unsigned> > m_perms;
3996
3997   /* A partitioning of the nodes for which a layout must be chosen.
3998      Each partition represents an <SCC, cfg loop> pair; that is,
3999      nodes in different SCCs belong to different partitions, and nodes
4000      within an SCC can be further partitioned according to a containing
4001      cfg loop.  Partition <SCC1, L1> comes before <SCC2, L2> if:
4002
4003      - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4004        from leaves (such as loads) to roots (such as stores).
4005
4006      - SCC1 == SCC2 and L1's header strictly dominates L2's header.  */
4007   auto_vec<slpg_partition_info> m_partitions;
4008
4009   /* The list of all nodes for which a layout must be chosen.  Nodes for
4010      partition P come before the nodes for partition P+1.  Nodes within a
4011      partition are in reverse postorder.  */
4012   auto_vec<unsigned int> m_partitioned_nodes;
4013
4014   /* Index P * num-layouts + L contains the cost of using layout L
4015      for partition P.  */
4016   auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4017
4018   /* Index N * num-layouts + L, if nonnull, is a node that provides the
4019      original output of node N adjusted to have layout L.  */
4020   auto_vec<slp_tree> m_node_layouts;
4021 };
4022
4023 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4024    Also record whether we should optimize anything for speed rather
4025    than size.  */
4026
4027 void
4028 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4029                                         slp_tree node)
4030 {
4031   unsigned i;
4032   slp_tree child;
4033
4034   if (visited.add (node))
4035     return;
4036
4037   if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4038     {
4039       basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4040       if (optimize_bb_for_speed_p (bb))
4041         m_optimize_size = false;
4042     }
4043
4044   node->vertex = m_vertices.length ();
4045   m_vertices.safe_push (slpg_vertex (node));
4046
4047   bool leaf = true;
4048   bool force_leaf = false;
4049   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4050     if (child)
4051       {
4052         leaf = false;
4053         build_vertices (visited, child);
4054       }
4055     else
4056       force_leaf = true;
4057   /* Since SLP discovery works along use-def edges all cycles have an
4058      entry - but there's the exception of cycles where we do not handle
4059      the entry explicitely (but with a NULL SLP node), like some reductions
4060      and inductions.  Force those SLP PHIs to act as leafs to make them
4061      backwards reachable.  */
4062   if (leaf || force_leaf)
4063     m_leafs.safe_push (node->vertex);
4064 }
4065
4066 /* Fill the vertices and leafs vector with all nodes in the SLP graph.  */
4067
4068 void
4069 vect_optimize_slp_pass::build_vertices ()
4070 {
4071   hash_set<slp_tree> visited;
4072   unsigned i;
4073   slp_instance instance;
4074   FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4075     build_vertices (visited, SLP_INSTANCE_TREE (instance));
4076 }
4077
4078 /* Apply (reverse) bijectite PERM to VEC.  */
4079
4080 template <class T>
4081 static void
4082 vect_slp_permute (vec<unsigned> perm,
4083                   vec<T> &vec, bool reverse)
4084 {
4085   auto_vec<T, 64> saved;
4086   saved.create (vec.length ());
4087   for (unsigned i = 0; i < vec.length (); ++i)
4088     saved.quick_push (vec[i]);
4089
4090   if (reverse)
4091     {
4092       for (unsigned i = 0; i < vec.length (); ++i)
4093         vec[perm[i]] = saved[i];
4094       for (unsigned i = 0; i < vec.length (); ++i)
4095         gcc_assert (vec[perm[i]] == saved[i]);
4096     }
4097   else
4098     {
4099       for (unsigned i = 0; i < vec.length (); ++i)
4100         vec[i] = saved[perm[i]];
4101       for (unsigned i = 0; i < vec.length (); ++i)
4102         gcc_assert (vec[i] == saved[perm[i]]);
4103     }
4104 }
4105
4106 /* Return the cfg loop that contains NODE.  */
4107
4108 struct loop *
4109 vect_optimize_slp_pass::containing_loop (slp_tree node)
4110 {
4111   stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4112   if (!rep)
4113     return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4114   return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4115 }
4116
4117 /* Return true if UD (an edge from a use to a definition) is associated
4118    with a loop latch edge in the cfg.  */
4119
4120 bool
4121 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4122 {
4123   slp_tree use = m_vertices[ud->src].node;
4124   slp_tree def = m_vertices[ud->dest].node;
4125   if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4126       || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4127     return false;
4128
4129   stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4130   return (is_a<gphi *> (use_rep->stmt)
4131           && bb_loop_header_p (gimple_bb (use_rep->stmt))
4132           && containing_loop (def) == containing_loop (use));
4133 }
4134
4135 /* Build the graph.  Mark edges that correspond to cfg loop latch edges with
4136    a nonnull data field.  */
4137
4138 void
4139 vect_optimize_slp_pass::build_graph ()
4140 {
4141   m_optimize_size = true;
4142   build_vertices ();
4143
4144   m_slpg = new_graph (m_vertices.length ());
4145   for (slpg_vertex &v : m_vertices)
4146     for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4147       if (child)
4148         {
4149           graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4150           if (is_cfg_latch_edge (ud))
4151             ud->data = this;
4152         }
4153 }
4154
4155 /* Return true if E corresponds to a loop latch edge in the cfg.  */
4156
4157 static bool
4158 skip_cfg_latch_edges (graph_edge *e)
4159 {
4160   return e->data;
4161 }
4162
4163 /* Create the node partitions.  */
4164
4165 void
4166 vect_optimize_slp_pass::create_partitions ()
4167 {
4168   /* Calculate a postorder of the graph, ignoring edges that correspond
4169      to natural latch edges in the cfg.  Reading the vector from the end
4170      to the beginning gives the reverse postorder.  */
4171   auto_vec<int> initial_rpo;
4172   graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4173                false, NULL, skip_cfg_latch_edges);
4174   gcc_assert (initial_rpo.length () == m_vertices.length ());
4175
4176   /* Calculate the strongly connected components of the graph.  */
4177   auto_vec<int> scc_grouping;
4178   unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4179
4180   /* Create a new index order in which all nodes from the same SCC are
4181      consecutive.  Use scc_pos to record the index of the first node in
4182      each SCC.  */
4183   auto_vec<unsigned int> scc_pos (num_sccs);
4184   int last_component = -1;
4185   unsigned int node_count = 0;
4186   for (unsigned int node_i : scc_grouping)
4187     {
4188       if (last_component != m_slpg->vertices[node_i].component)
4189         {
4190           last_component = m_slpg->vertices[node_i].component;
4191           gcc_assert (last_component == int (scc_pos.length ()));
4192           scc_pos.quick_push (node_count);
4193         }
4194       node_count += 1;
4195     }
4196   gcc_assert (node_count == initial_rpo.length ()
4197               && last_component + 1 == int (num_sccs));
4198
4199   /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4200      inside each SCC following the RPO we calculated above.  The fact that
4201      we ignored natural latch edges when calculating the RPO should ensure
4202      that, for natural loop nests:
4203
4204      - the first node that we encounter in a cfg loop is the loop header phi
4205      - the loop header phis are in dominance order
4206
4207      Arranging for this is an optimization (see below) rather than a
4208      correctness issue.  Unnatural loops with a tangled mess of backedges
4209      will still work correctly, but might give poorer results.
4210
4211      Also update scc_pos so that it gives 1 + the index of the last node
4212      in the SCC.  */
4213   m_partitioned_nodes.safe_grow (node_count);
4214   for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4215     {
4216       unsigned int node_i = initial_rpo[old_i];
4217       unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4218       m_partitioned_nodes[new_i] = node_i;
4219     }
4220
4221   /* When optimizing for speed, partition each SCC based on the containing
4222      cfg loop. The order we constructed above should ensure that, for natural
4223      cfg loops, we'll create sub-SCC partitions for outer loops before
4224      the corresponding sub-SCC partitions for inner loops.  Similarly,
4225      when one sibling loop A dominates another sibling loop B, we should
4226      create a sub-SCC partition for A before a sub-SCC partition for B.
4227
4228      As above, nothing depends for correctness on whether this achieves
4229      a natural nesting, but we should get better results when it does.  */
4230   m_partitions.reserve (m_vertices.length ());
4231   unsigned int next_partition_i = 0;
4232   hash_map<struct loop *, int> loop_partitions;
4233   unsigned int rpo_begin = 0;
4234   unsigned int num_partitioned_nodes = 0;
4235   for (unsigned int rpo_end : scc_pos)
4236     {
4237       loop_partitions.empty ();
4238       unsigned int partition_i = next_partition_i;
4239       for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4240         {
4241           /* Handle externals and constants optimistically throughout.
4242              But treat existing vectors as fixed since we do not handle
4243              permuting them.  */
4244           unsigned int node_i = m_partitioned_nodes[rpo_i];
4245           auto &vertex = m_vertices[node_i];
4246           if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4247                && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4248               || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4249             vertex.partition = -1;
4250           else
4251             {
4252               bool existed;
4253               if (m_optimize_size)
4254                 existed = next_partition_i > partition_i;
4255               else
4256                 {
4257                   struct loop *loop = containing_loop (vertex.node);
4258                   auto &entry = loop_partitions.get_or_insert (loop, &existed);
4259                   if (!existed)
4260                     entry = next_partition_i;
4261                   partition_i = entry;
4262                 }
4263               if (!existed)
4264                 {
4265                   m_partitions.quick_push (slpg_partition_info ());
4266                   next_partition_i += 1;
4267                 }
4268               vertex.partition = partition_i;
4269               num_partitioned_nodes += 1;
4270               m_partitions[partition_i].node_end += 1;
4271             }
4272         }
4273       rpo_begin = rpo_end;
4274     }
4275
4276   /* Assign ranges of consecutive node indices to each partition,
4277      in partition order.  Start with node_end being the same as
4278      node_begin so that the next loop can use it as a counter.  */
4279   unsigned int node_begin = 0;
4280   for (auto &partition : m_partitions)
4281     {
4282       partition.node_begin = node_begin;
4283       node_begin += partition.node_end;
4284       partition.node_end = partition.node_begin;
4285     }
4286   gcc_assert (node_begin == num_partitioned_nodes);
4287
4288   /* Finally build the list of nodes in partition order.  */
4289   m_partitioned_nodes.truncate (num_partitioned_nodes);
4290   for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4291     {
4292       int partition_i = m_vertices[node_i].partition;
4293       if (partition_i >= 0)
4294         {
4295           unsigned int order_i = m_partitions[partition_i].node_end++;
4296           m_partitioned_nodes[order_i] = node_i;
4297         }
4298     }
4299 }
4300
4301 /* Look for edges from earlier partitions into node NODE_I and edges from
4302    node NODE_I into later partitions.  Call:
4303
4304       FN (ud, other_node_i)
4305
4306    for each such use-to-def edge ud, where other_node_i is the node at the
4307    other end of the edge.  */
4308
4309 template<typename T>
4310 void
4311 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4312 {
4313   int partition_i = m_vertices[node_i].partition;
4314   for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4315        pred; pred = pred->pred_next)
4316     {
4317       int src_partition_i = m_vertices[pred->src].partition;
4318       if (src_partition_i >= 0 && src_partition_i != partition_i)
4319         fn (pred, pred->src);
4320     }
4321   for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4322        succ; succ = succ->succ_next)
4323     {
4324       int dest_partition_i = m_vertices[succ->dest].partition;
4325       if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4326         fn (succ, succ->dest);
4327     }
4328 }
4329
4330 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4331    that NODE would operate on.  This test is independent of NODE's actual
4332    operation.  */
4333
4334 bool
4335 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4336                                               unsigned int layout_i)
4337 {
4338   if (layout_i == 0)
4339     return true;
4340
4341   if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4342     return false;
4343
4344   return true;
4345 }
4346
4347 /* Return the cost (in arbtirary units) of going from layout FROM_LAYOUT_I
4348    to layout TO_LAYOUT_I for a node like NODE.  Return -1 if either of the
4349    layouts is incompatible with NODE or if the change is not possible for
4350    some other reason.
4351
4352    The properties taken from NODE include the number of lanes and the
4353    vector type.  The actual operation doesn't matter.  */
4354
4355 int
4356 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4357                                             unsigned int from_layout_i,
4358                                             unsigned int to_layout_i)
4359 {
4360   if (!is_compatible_layout (node, from_layout_i)
4361       || !is_compatible_layout (node, to_layout_i))
4362     return -1;
4363
4364   if (from_layout_i == to_layout_i)
4365     return 0;
4366
4367   auto_vec<slp_tree, 1> children (1);
4368   children.quick_push (node);
4369   auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4370   if (from_layout_i > 0)
4371     for (unsigned int i : m_perms[from_layout_i])
4372       perm.quick_push ({ 0, i });
4373   else
4374     for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4375       perm.quick_push ({ 0, i });
4376   if (to_layout_i > 0)
4377     vect_slp_permute (m_perms[to_layout_i], perm, true);
4378   auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4379                                                children, false);
4380   if (count >= 0)
4381     return MAX (count, 1);
4382
4383   /* ??? In principle we could try changing via layout 0, giving two
4384      layout changes rather than 1.  Doing that would require
4385      corresponding support in get_result_with_layout.  */
4386   return -1;
4387 }
4388
4389 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I.  */
4390
4391 inline slpg_partition_layout_costs &
4392 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4393                                                 unsigned int layout_i)
4394 {
4395   return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4396 }
4397
4398 /* Change PERM in one of two ways:
4399
4400    - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4401      chosen for child I of NODE.
4402
4403    - if IN_LAYOUT >= 0, accept all inputs operands with that layout.
4404
4405    In both cases, arrange for the output to have layout OUT_LAYOUT_I  */
4406
4407 void
4408 vect_optimize_slp_pass::
4409 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4410                         int in_layout_i, unsigned int out_layout_i)
4411 {
4412   for (auto &entry : perm)
4413     {
4414       int this_in_layout_i = in_layout_i;
4415       if (this_in_layout_i < 0)
4416         {
4417           slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4418           unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4419           this_in_layout_i = m_partitions[in_partition_i].layout;
4420         }
4421       if (this_in_layout_i > 0)
4422         entry.second = m_perms[this_in_layout_i][entry.second];
4423     }
4424   if (out_layout_i > 0)
4425     vect_slp_permute (m_perms[out_layout_i], perm, true);
4426 }
4427
4428 /* Check whether the target allows NODE to be rearranged so that the node's
4429    output has layout OUT_LAYOUT_I.  Return the cost of the change if so,
4430    in the same arbitrary units as for change_layout_cost.  Return -1 otherwise.
4431
4432    If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4433    NODE can adapt to the layout changes that have (perhaps provisionally)
4434    been chosen for NODE's children, so that no extra permutations are
4435    needed on either the input or the output of NODE.
4436
4437    If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4438    that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4439
4440    IN_LAYOUT_I has no meaning for other types of node.
4441
4442    Keeping the node as-is is always valid.  If the target doesn't appear
4443    to support the node as-is, but might realistically support other layouts,
4444    then layout 0 instead has the cost of a worst-case permutation.  On the
4445    one hand, this ensures that every node has at least one valid layout,
4446    avoiding what would otherwise be an awkward special case.  On the other,
4447    it still encourages the pass to change an invalid pre-existing layout
4448    choice into a valid one.  */
4449
4450 int
4451 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4452                                             unsigned int out_layout_i)
4453 {
4454   const int fallback_cost = 1;
4455
4456   if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4457     {
4458       auto_lane_permutation_t tmp_perm;
4459       tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4460
4461       /* Check that the child nodes support the chosen layout.  Checking
4462          the first child is enough, since any second child would have the
4463          same shape.  */
4464       auto first_child = SLP_TREE_CHILDREN (node)[0];
4465       if (in_layout_i > 0
4466           && !is_compatible_layout (first_child, in_layout_i))
4467         return -1;
4468
4469       change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4470       int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4471                                                   node, tmp_perm,
4472                                                   SLP_TREE_CHILDREN (node),
4473                                                   false);
4474       if (count < 0)
4475         {
4476           if (in_layout_i == 0 && out_layout_i == 0)
4477             {
4478               /* Use the fallback cost if the node could in principle support
4479                  some nonzero layout for both the inputs and the outputs.
4480                  Otherwise assume that the node will be rejected later
4481                  and rebuilt from scalars.  */
4482               if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4483                 return fallback_cost;
4484               return 0;
4485             }
4486           return -1;
4487         }
4488
4489       /* We currently have no way of telling whether the new layout is cheaper
4490          or more expensive than the old one.  But at least in principle,
4491          it should be worth making zero permutations (whole-vector shuffles)
4492          cheaper than real permutations, in case the pass is able to remove
4493          the latter.  */
4494       return count == 0 ? 0 : 1;
4495     }
4496
4497   stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4498   if (rep
4499       && STMT_VINFO_DATA_REF (rep)
4500       && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4501       && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4502     {
4503       auto_load_permutation_t tmp_perm;
4504       tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4505       if (out_layout_i > 0)
4506         vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4507
4508       poly_uint64 vf = 1;
4509       if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4510         vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4511       unsigned int n_perms;
4512       if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4513                                            nullptr, vf, true, false, &n_perms))
4514         {
4515           auto rep = SLP_TREE_REPRESENTATIVE (node);
4516           if (out_layout_i == 0)
4517             {
4518               /* Use the fallback cost if the load is an N-to-N permutation.
4519                  Otherwise assume that the node will be rejected later
4520                  and rebuilt from scalars.  */
4521               if (STMT_VINFO_GROUPED_ACCESS (rep)
4522                   && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4523                       == SLP_TREE_LANES (node)))
4524                 return fallback_cost;
4525               return 0;
4526             }
4527           return -1;
4528         }
4529
4530       /* See the comment above the corresponding VEC_PERM_EXPR handling.  */
4531       return n_perms == 0 ? 0 : 1;
4532     }
4533
4534   return 0;
4535 }
4536
4537 /* Decide which element layouts we should consider using.  Calculate the
4538    weights associated with inserting layout changes on partition edges.
4539    Also mark partitions that cannot change layout, by setting their
4540    layout to zero.  */
4541
4542 void
4543 vect_optimize_slp_pass::start_choosing_layouts ()
4544 {
4545   /* Used to assign unique permutation indices.  */
4546   using perm_hash = unbounded_hashmap_traits<
4547     vec_free_hash_base<int_hash_base<unsigned>>,
4548     int_hash<int, -1, -2>
4549   >;
4550   hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4551
4552   /* Layout 0 is "no change".  */
4553   m_perms.safe_push (vNULL);
4554
4555   /* Create layouts from existing permutations.  */
4556   auto_load_permutation_t tmp_perm;
4557   for (unsigned int node_i : m_partitioned_nodes)
4558     {
4559       /* Leafs also double as entries to the reverse graph.  Allow the
4560          layout of those to be changed.  */
4561       auto &vertex = m_vertices[node_i];
4562       auto &partition = m_partitions[vertex.partition];
4563       if (!m_slpg->vertices[node_i].succ)
4564         partition.layout = 0;
4565
4566       /* Loads and VEC_PERM_EXPRs are the only things generating permutes.  */
4567       slp_tree node = vertex.node;
4568       stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4569       slp_tree child;
4570       unsigned HOST_WIDE_INT imin, imax = 0;
4571       bool any_permute = false;
4572       tmp_perm.truncate (0);
4573       if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4574         {
4575           /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4576              unpermuted, record a layout that reverses this permutation.
4577
4578              We would need more work to cope with loads that are internally
4579              permuted and also have inputs (such as masks for
4580              IFN_MASK_LOADs).  */
4581           gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4582           if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4583             continue;
4584           dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4585           imin = DR_GROUP_SIZE (dr_stmt) + 1;
4586           tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4587         }
4588       else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4589                && SLP_TREE_CHILDREN (node).length () == 1
4590                && (child = SLP_TREE_CHILDREN (node)[0])
4591                && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4592                    .is_constant (&imin)))
4593         {
4594           /* If the child has the same vector size as this node,
4595              reversing the permutation can make the permutation a no-op.
4596              In other cases it can change a true permutation into a
4597              full-vector extract.  */
4598           tmp_perm.reserve (SLP_TREE_LANES (node));
4599           for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4600             tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4601         }
4602       else
4603         continue;
4604
4605       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4606         {
4607           unsigned idx = tmp_perm[j];
4608           imin = MIN (imin, idx);
4609           imax = MAX (imax, idx);
4610           if (idx - tmp_perm[0] != j)
4611             any_permute = true;
4612         }
4613       /* If the span doesn't match we'd disrupt VF computation, avoid
4614          that for now.  */
4615       if (imax - imin + 1 != SLP_TREE_LANES (node))
4616         continue;
4617       /* If there's no permute no need to split one out.  In this case
4618          we can consider turning a load into a permuted load, if that
4619          turns out to be cheaper than alternatives.  */
4620       if (!any_permute)
4621         {
4622           partition.layout = -1;
4623           continue;
4624         }
4625
4626       /* For now only handle true permutes, like
4627          vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
4628          when permuting constants and invariants keeping the permute
4629          bijective.  */
4630       auto_sbitmap load_index (SLP_TREE_LANES (node));
4631       bitmap_clear (load_index);
4632       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4633         bitmap_set_bit (load_index, tmp_perm[j] - imin);
4634       unsigned j;
4635       for (j = 0; j < SLP_TREE_LANES (node); ++j)
4636         if (!bitmap_bit_p (load_index, j))
4637           break;
4638       if (j != SLP_TREE_LANES (node))
4639         continue;
4640
4641       vec<unsigned> perm = vNULL;
4642       perm.safe_grow (SLP_TREE_LANES (node), true);
4643       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4644         perm[j] = tmp_perm[j] - imin;
4645
4646       if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4647         {
4648           /* Continue to use existing layouts, but don't add any more.  */
4649           int *entry = layout_ids.get (perm);
4650           partition.layout = entry ? *entry : 0;
4651           perm.release ();
4652         }
4653       else
4654         {
4655           bool existed;
4656           int &layout_i = layout_ids.get_or_insert (perm, &existed);
4657           if (existed)
4658             perm.release ();
4659           else
4660             {
4661               layout_i = m_perms.length ();
4662               m_perms.safe_push (perm);
4663             }
4664           partition.layout = layout_i;
4665         }
4666     }
4667
4668   /* Initially assume that every layout is possible and has zero cost
4669      in every partition.  */
4670   m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4671                                               * m_perms.length ());
4672
4673   /* We have to mark outgoing permutations facing non-reduction graph
4674      entries that are not represented as to be materialized.  */
4675   for (slp_instance instance : m_vinfo->slp_instances)
4676     if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4677       {
4678         unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4679         m_partitions[m_vertices[node_i].partition].layout = 0;
4680       }
4681
4682   /* Check which layouts each node and partition can handle.  Calculate the
4683      weights associated with inserting layout changes on edges.  */
4684   for (unsigned int node_i : m_partitioned_nodes)
4685     {
4686       auto &vertex = m_vertices[node_i];
4687       auto &partition = m_partitions[vertex.partition];
4688       slp_tree node = vertex.node;
4689
4690       if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4691         {
4692           vertex.weight = vect_slp_node_weight (node);
4693
4694           /* We do not handle stores with a permutation, so all
4695              incoming permutations must have been materialized.
4696
4697              We also don't handle masked grouped loads, which lack a
4698              permutation vector.  In this case the memory locations
4699              form an implicit second input to the loads, on top of the
4700              explicit mask input, and the memory input's layout cannot
4701              be changed.
4702
4703              On the other hand, we do support permuting gather loads and
4704              masked gather loads, where each scalar load is independent
4705              of the others.  This can be useful if the address/index input
4706              benefits from permutation.  */
4707           if (STMT_VINFO_DATA_REF (rep)
4708               && STMT_VINFO_GROUPED_ACCESS (rep)
4709               && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4710             partition.layout = 0;
4711
4712           /* We cannot change the layout of an operation that is
4713              not independent on lanes.  Note this is an explicit
4714              negative list since that's much shorter than the respective
4715              positive one but it's critical to keep maintaining it.  */
4716           if (is_gimple_call (STMT_VINFO_STMT (rep)))
4717             switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4718               {
4719               case CFN_COMPLEX_ADD_ROT90:
4720               case CFN_COMPLEX_ADD_ROT270:
4721               case CFN_COMPLEX_MUL:
4722               case CFN_COMPLEX_MUL_CONJ:
4723               case CFN_VEC_ADDSUB:
4724               case CFN_VEC_FMADDSUB:
4725               case CFN_VEC_FMSUBADD:
4726                 partition.layout = 0;
4727               default:;
4728               }
4729         }
4730
4731       auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4732         {
4733           auto &other_vertex = m_vertices[other_node_i];
4734
4735           /* Count the number of edges from earlier partitions and the number
4736              of edges to later partitions.  */
4737           if (other_vertex.partition < vertex.partition)
4738             partition.in_degree += 1;
4739           else
4740             partition.out_degree += 1;
4741
4742           /* If the current node uses the result of OTHER_NODE_I, accumulate
4743              the effects of that.  */
4744           if (ud->src == int (node_i))
4745             {
4746               other_vertex.out_weight += vertex.weight;
4747               other_vertex.out_degree += 1;
4748             }
4749         };
4750       for_each_partition_edge (node_i, process_edge);
4751     }
4752 }
4753
4754 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4755    its current (provisional) choice of layout.  The inputs do not necessarily
4756    have the same layout as each other.  */
4757
4758 slpg_layout_cost
4759 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4760 {
4761   auto &vertex = m_vertices[node_i];
4762   slpg_layout_cost cost;
4763   auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4764     {
4765       auto &other_vertex = m_vertices[other_node_i];
4766       if (other_vertex.partition < vertex.partition)
4767         {
4768           auto &other_partition = m_partitions[other_vertex.partition];
4769           auto &other_costs = partition_layout_costs (other_vertex.partition,
4770                                                       other_partition.layout);
4771           slpg_layout_cost this_cost = other_costs.in_cost;
4772           this_cost.add_serial_cost (other_costs.internal_cost);
4773           this_cost.split (other_partition.out_degree);
4774           cost.add_parallel_cost (this_cost);
4775         }
4776     };
4777   for_each_partition_edge (node_i, add_cost);
4778   return cost;
4779 }
4780
4781 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4782    and layout LAYOUT2_I on cross-partition use-to-def edge UD.  Return
4783    slpg_layout_cost::impossible () if the change isn't possible.  */
4784
4785 slpg_layout_cost
4786 vect_optimize_slp_pass::
4787 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4788                   unsigned int layout2_i)
4789 {
4790   auto &def_vertex = m_vertices[ud->dest];
4791   auto &use_vertex = m_vertices[ud->src];
4792   auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4793   auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4794   auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4795                                     use_layout_i);
4796   if (factor < 0)
4797     return slpg_layout_cost::impossible ();
4798
4799   /* We have a choice of putting the layout change at the site of the
4800      definition or at the site of the use.  Prefer the former when
4801      optimizing for size or when the execution frequency of the
4802      definition is no greater than the combined execution frequencies of
4803      the uses.  When putting the layout change at the site of the definition,
4804      divvy up the cost among all consumers.  */
4805   if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4806     {
4807       slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4808       cost.split (def_vertex.out_degree);
4809       return cost;
4810     }
4811   return { use_vertex.weight * factor, m_optimize_size };
4812 }
4813
4814 /* UD represents a use-def link between FROM_NODE_I and a node in a later
4815    partition; FROM_NODE_I could be the definition node or the use node.
4816    The node at the other end of the link wants to use layout TO_LAYOUT_I.
4817    Return the cost of any necessary fix-ups on edge UD, or return
4818    slpg_layout_cost::impossible () if the change isn't possible.
4819
4820    At this point, FROM_NODE_I's partition has chosen the cheapest
4821    layout based on the information available so far, but this choice
4822    is only provisional.  */
4823
4824 slpg_layout_cost
4825 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
4826                                       unsigned int to_layout_i)
4827 {
4828   auto &from_vertex = m_vertices[from_node_i];
4829   unsigned int from_partition_i = from_vertex.partition;
4830   slpg_partition_info &from_partition = m_partitions[from_partition_i];
4831   gcc_assert (from_partition.layout >= 0);
4832
4833   /* First calculate the cost on the assumption that FROM_PARTITION sticks
4834      with its current layout preference.  */
4835   slpg_layout_cost cost = slpg_layout_cost::impossible ();
4836   auto edge_cost = edge_layout_cost (ud, from_node_i,
4837                                      from_partition.layout, to_layout_i);
4838   if (edge_cost.is_possible ())
4839     {
4840       auto &from_costs = partition_layout_costs (from_partition_i,
4841                                                  from_partition.layout);
4842       cost = from_costs.in_cost;
4843       cost.add_serial_cost (from_costs.internal_cost);
4844       cost.split (from_partition.out_degree);
4845       cost.add_serial_cost (edge_cost);
4846     }
4847
4848   /* Take the minimum of that cost and the cost that applies if
4849      FROM_PARTITION instead switches to TO_LAYOUT_I.  */
4850   auto &direct_layout_costs = partition_layout_costs (from_partition_i,
4851                                                       to_layout_i);
4852   if (direct_layout_costs.is_possible ())
4853     {
4854       slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
4855       direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
4856       direct_cost.split (from_partition.out_degree);
4857       if (!cost.is_possible ()
4858           || direct_cost.is_better_than (cost, m_optimize_size))
4859         cost = direct_cost;
4860     }
4861
4862   return cost;
4863 }
4864
4865 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
4866    partition; TO_NODE_I could be the definition node or the use node.
4867    The node at the other end of the link wants to use layout FROM_LAYOUT_I;
4868    return the cost of any necessary fix-ups on edge UD, or
4869    slpg_layout_cost::impossible () if the choice cannot be made.
4870
4871    At this point, TO_NODE_I's partition has a fixed choice of layout.  */
4872
4873 slpg_layout_cost
4874 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
4875                                        unsigned int from_layout_i)
4876 {
4877   auto &to_vertex = m_vertices[to_node_i];
4878   unsigned int to_partition_i = to_vertex.partition;
4879   slpg_partition_info &to_partition = m_partitions[to_partition_i];
4880   gcc_assert (to_partition.layout >= 0);
4881
4882   /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
4883      adjusted for this input having layout FROM_LAYOUT_I.  Assume that
4884      any other inputs keep their current choice of layout.  */
4885   auto &to_costs = partition_layout_costs (to_partition_i,
4886                                            to_partition.layout);
4887   if (ud->src == int (to_node_i)
4888       && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
4889     {
4890       auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
4891       auto old_layout = from_partition.layout;
4892       from_partition.layout = from_layout_i;
4893       int factor = internal_node_cost (to_vertex.node, -1,
4894                                        to_partition.layout);
4895       from_partition.layout = old_layout;
4896       if (factor >= 0)
4897         {
4898           slpg_layout_cost cost = to_costs.out_cost;
4899           cost.add_serial_cost ({ to_vertex.weight * factor,
4900                                   m_optimize_size });
4901           cost.split (to_partition.in_degree);
4902           return cost;
4903         }
4904     }
4905
4906   /* Compute the cost if we insert any necessary layout change on edge UD.  */
4907   auto edge_cost = edge_layout_cost (ud, to_node_i,
4908                                      to_partition.layout, from_layout_i);
4909   if (edge_cost.is_possible ())
4910     {
4911       slpg_layout_cost cost = to_costs.out_cost;
4912       cost.add_serial_cost (to_costs.internal_cost);
4913       cost.split (to_partition.in_degree);
4914       cost.add_serial_cost (edge_cost);
4915       return cost;
4916     }
4917
4918   return slpg_layout_cost::impossible ();
4919 }
4920
4921 /* Make a forward pass through the partitions, accumulating input costs.
4922    Make a tentative (provisional) choice of layout for each partition,
4923    ensuring that this choice still allows later partitions to keep
4924    their original layout.  */
4925
4926 void
4927 vect_optimize_slp_pass::forward_pass ()
4928 {
4929   for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
4930        ++partition_i)
4931     {
4932       auto &partition = m_partitions[partition_i];
4933
4934       /* If the partition consists of a single VEC_PERM_EXPR, precompute
4935          the incoming cost that would apply if every predecessor partition
4936          keeps its current layout.  This is used within the loop below.  */
4937       slpg_layout_cost in_cost;
4938       slp_tree single_node = nullptr;
4939       if (partition.node_end == partition.node_begin + 1)
4940         {
4941           unsigned int node_i = m_partitioned_nodes[partition.node_begin];
4942           single_node = m_vertices[node_i].node;
4943           if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
4944             in_cost = total_in_cost (node_i);
4945         }
4946
4947       /* Go through the possible layouts.  Decide which ones are valid
4948          for this partition and record which of the valid layouts has
4949          the lowest cost.  */
4950       unsigned int min_layout_i = 0;
4951       slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
4952       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
4953         {
4954           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
4955           if (!layout_costs.is_possible ())
4956             continue;
4957
4958           /* If the recorded layout is already 0 then the layout cannot
4959              change.  */
4960           if (partition.layout == 0 && layout_i != 0)
4961             {
4962               layout_costs.mark_impossible ();
4963               continue;
4964             }
4965
4966           bool is_possible = true;
4967           for (unsigned int order_i = partition.node_begin;
4968                order_i < partition.node_end; ++order_i)
4969             {
4970               unsigned int node_i = m_partitioned_nodes[order_i];
4971               auto &vertex = m_vertices[node_i];
4972
4973               /* Reject the layout if it is individually incompatible
4974                  with any node in the partition.  */
4975               if (!is_compatible_layout (vertex.node, layout_i))
4976                 {
4977                   is_possible = false;
4978                   break;
4979                 }
4980
4981               auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
4982                 {
4983                   auto &other_vertex = m_vertices[other_node_i];
4984                   if (other_vertex.partition < vertex.partition)
4985                     {
4986                       /* Accumulate the incoming costs from earlier
4987                          partitions, plus the cost of any layout changes
4988                          on UD itself.  */
4989                       auto cost = forward_cost (ud, other_node_i, layout_i);
4990                       if (!cost.is_possible ())
4991                         is_possible = false;
4992                       else
4993                         layout_costs.in_cost.add_parallel_cost (cost);
4994                     }
4995                   else
4996                     /* Reject the layout if it would make layout 0 impossible
4997                        for later partitions.  This amounts to testing that the
4998                        target supports reversing the layout change on edges
4999                        to later partitions.
5000
5001                        In principle, it might be possible to push a layout
5002                        change all the way down a graph, so that it never
5003                        needs to be reversed and so that the target doesn't
5004                        need to support the reverse operation.  But it would
5005                        be awkward to bail out if we hit a partition that
5006                        does not support the new layout, especially since
5007                        we are not dealing with a lattice.  */
5008                     is_possible &= edge_layout_cost (ud, other_node_i, 0,
5009                                                      layout_i).is_possible ();
5010                 };
5011               for_each_partition_edge (node_i, add_cost);
5012
5013               /* Accumulate the cost of using LAYOUT_I within NODE,
5014                  both for the inputs and the outputs.  */
5015               int factor = internal_node_cost (vertex.node, layout_i,
5016                                                layout_i);
5017               if (factor < 0)
5018                 {
5019                   is_possible = false;
5020                   break;
5021                 }
5022               else if (factor)
5023                 layout_costs.internal_cost.add_serial_cost
5024                   ({ vertex.weight * factor, m_optimize_size });
5025             }
5026           if (!is_possible)
5027             {
5028               layout_costs.mark_impossible ();
5029               continue;
5030             }
5031
5032           /* Combine the incoming and partition-internal costs.  */
5033           slpg_layout_cost combined_cost = layout_costs.in_cost;
5034           combined_cost.add_serial_cost (layout_costs.internal_cost);
5035
5036           /* If this partition consists of a single VEC_PERM_EXPR, see
5037              if the VEC_PERM_EXPR can be changed to support output layout
5038              LAYOUT_I while keeping all the provisional choices of input
5039              layout.  */
5040           if (single_node
5041               && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5042             {
5043               int factor = internal_node_cost (single_node, -1, layout_i);
5044               if (factor >= 0)
5045                 {
5046                   auto weight = m_vertices[single_node->vertex].weight;
5047                   slpg_layout_cost internal_cost
5048                     = { weight * factor, m_optimize_size };
5049
5050                   slpg_layout_cost alt_cost = in_cost;
5051                   alt_cost.add_serial_cost (internal_cost);
5052                   if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5053                     {
5054                       combined_cost = alt_cost;
5055                       layout_costs.in_cost = in_cost;
5056                       layout_costs.internal_cost = internal_cost;
5057                     }
5058                 }
5059             }
5060
5061           /* Record the layout with the lowest cost.  Prefer layout 0 in
5062              the event of a tie between it and another layout.  */
5063           if (!min_layout_cost.is_possible ()
5064               || combined_cost.is_better_than (min_layout_cost,
5065                                                m_optimize_size))
5066             {
5067               min_layout_i = layout_i;
5068               min_layout_cost = combined_cost;
5069             }
5070         }
5071
5072       /* This loop's handling of earlier partitions should ensure that
5073          choosing the original layout for the current partition is no
5074          less valid than it was in the original graph, even with the
5075          provisional layout choices for those earlier partitions.  */
5076       gcc_assert (min_layout_cost.is_possible ());
5077       partition.layout = min_layout_i;
5078     }
5079 }
5080
5081 /* Make a backward pass through the partitions, accumulating output costs.
5082    Make a final choice of layout for each partition.  */
5083
5084 void
5085 vect_optimize_slp_pass::backward_pass ()
5086 {
5087   for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5088     {
5089       auto &partition = m_partitions[partition_i];
5090
5091       unsigned int min_layout_i = 0;
5092       slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5093       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5094         {
5095           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5096           if (!layout_costs.is_possible ())
5097             continue;
5098
5099           /* Accumulate the costs from successor partitions.  */
5100           bool is_possible = true;
5101           for (unsigned int order_i = partition.node_begin;
5102                order_i < partition.node_end; ++order_i)
5103             {
5104               unsigned int node_i = m_partitioned_nodes[order_i];
5105               auto &vertex = m_vertices[node_i];
5106               auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5107                 {
5108                   auto &other_vertex = m_vertices[other_node_i];
5109                   auto &other_partition = m_partitions[other_vertex.partition];
5110                   if (other_vertex.partition > vertex.partition)
5111                     {
5112                       /* Accumulate the incoming costs from later
5113                          partitions, plus the cost of any layout changes
5114                          on UD itself.  */
5115                       auto cost = backward_cost (ud, other_node_i, layout_i);
5116                       if (!cost.is_possible ())
5117                         is_possible = false;
5118                       else
5119                         layout_costs.out_cost.add_parallel_cost (cost);
5120                     }
5121                   else
5122                     /* Make sure that earlier partitions can (if necessary
5123                        or beneficial) keep the layout that they chose in
5124                        the forward pass.  This ensures that there is at
5125                        least one valid choice of layout.  */
5126                     is_possible &= edge_layout_cost (ud, other_node_i,
5127                                                      other_partition.layout,
5128                                                      layout_i).is_possible ();
5129                 };
5130               for_each_partition_edge (node_i, add_cost);
5131             }
5132           if (!is_possible)
5133             {
5134               layout_costs.mark_impossible ();
5135               continue;
5136             }
5137
5138           /* Locally combine the costs from the forward and backward passes.
5139              (This combined cost is not passed on, since that would lead
5140              to double counting.)  */
5141           slpg_layout_cost combined_cost = layout_costs.in_cost;
5142           combined_cost.add_serial_cost (layout_costs.internal_cost);
5143           combined_cost.add_serial_cost (layout_costs.out_cost);
5144
5145           /* Record the layout with the lowest cost.  Prefer layout 0 in
5146              the event of a tie between it and another layout.  */
5147           if (!min_layout_cost.is_possible ()
5148               || combined_cost.is_better_than (min_layout_cost,
5149                                                m_optimize_size))
5150             {
5151               min_layout_i = layout_i;
5152               min_layout_cost = combined_cost;
5153             }
5154         }
5155
5156       gcc_assert (min_layout_cost.is_possible ());
5157       partition.layout = min_layout_i;
5158     }
5159 }
5160
5161 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5162    NODE already has the layout that was selected for its partition.  */
5163
5164 slp_tree
5165 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5166                                                 unsigned int to_layout_i)
5167 {
5168   unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5169   slp_tree result = m_node_layouts[result_i];
5170   if (result)
5171     return result;
5172
5173   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5174       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5175     {
5176       /* If the vector is uniform or unchanged, there's nothing to do.  */
5177       if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5178         result = node;
5179       else
5180         {
5181           auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5182           result = vect_create_new_slp_node (scalar_ops);
5183           vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5184         }
5185     }
5186   else
5187     {
5188       unsigned int partition_i = m_vertices[node->vertex].partition;
5189       unsigned int from_layout_i = m_partitions[partition_i].layout;
5190       if (from_layout_i == to_layout_i)
5191         return node;
5192
5193       /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5194          permutation instead of a serial one.  Leave the new permutation
5195          in TMP_PERM on success.  */
5196       auto_lane_permutation_t tmp_perm;
5197       unsigned int num_inputs = 1;
5198       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5199         {
5200           tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5201           if (from_layout_i != 0)
5202             vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5203           if (to_layout_i != 0)
5204             vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5205           if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5206                                               tmp_perm,
5207                                               SLP_TREE_CHILDREN (node),
5208                                               false) >= 0)
5209             num_inputs = SLP_TREE_CHILDREN (node).length ();
5210           else
5211             tmp_perm.truncate (0);
5212         }
5213
5214       if (dump_enabled_p ())
5215         {
5216           if (tmp_perm.length () > 0)
5217             dump_printf_loc (MSG_NOTE, vect_location,
5218                              "duplicating permutation node %p with"
5219                              " layout %d\n",
5220                              (void *) node, to_layout_i);
5221           else
5222             dump_printf_loc (MSG_NOTE, vect_location,
5223                              "inserting permutation node in place of %p\n",
5224                              (void *) node);
5225         }
5226
5227       unsigned int num_lanes = SLP_TREE_LANES (node);
5228       result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5229       if (SLP_TREE_SCALAR_STMTS (node).length ())
5230         {
5231           auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5232           stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5233           if (from_layout_i != 0)
5234             vect_slp_permute (m_perms[from_layout_i], stmts, false);
5235           if (to_layout_i != 0)
5236             vect_slp_permute (m_perms[to_layout_i], stmts, true);
5237         }
5238       SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5239       SLP_TREE_LANES (result) = num_lanes;
5240       SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5241       result->vertex = -1;
5242
5243       auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5244       if (tmp_perm.length ())
5245         {
5246           lane_perm.safe_splice (tmp_perm);
5247           SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5248         }
5249       else
5250         {
5251           lane_perm.create (num_lanes);
5252           for (unsigned j = 0; j < num_lanes; ++j)
5253             lane_perm.quick_push ({ 0, j });
5254           if (from_layout_i != 0)
5255             vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5256           if (to_layout_i != 0)
5257             vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5258           SLP_TREE_CHILDREN (result).safe_push (node);
5259         }
5260       for (slp_tree child : SLP_TREE_CHILDREN (result))
5261         child->refcnt++;
5262     }
5263   m_node_layouts[result_i] = result;
5264   return result;
5265 }
5266
5267 /* Apply the chosen vector layouts to the SLP graph.  This is done in
        three steps: permute each partitioned node's scalar statements and
        load or lane permutation into its partition's layout, elide
        redundant load permutations, and finally replace the children of
        every node that was not fully folded with versions that have that
        node's layout.  */
5268
5269 void
5270 vect_optimize_slp_pass::materialize ()
5271 {
5272   /* We no longer need the costs, so avoid having two O(N * P) arrays
5273      live at the same time.  */
5274   m_partition_layout_costs.release ();
5275   m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5276
       /* FULLY_FOLDED records the VEC_PERM_EXPR nodes whose lane permutation
          absorbed the input layouts; the final loop below leaves the
          children of those nodes untouched.  */
5277   auto_sbitmap fully_folded (m_vertices.length ());
5278   bitmap_clear (fully_folded);
5279   for (unsigned int node_i : m_partitioned_nodes)
5280     {
5281       auto &vertex = m_vertices[node_i];
5282       slp_tree node = vertex.node;
5283       int layout_i = m_partitions[vertex.partition].layout;
5284       gcc_assert (layout_i >= 0);
5285
5286       /* Rearrange the scalar statements to match the chosen layout.  */
5287       if (layout_i > 0)
5288         vect_slp_permute (m_perms[layout_i],
5289                           SLP_TREE_SCALAR_STMTS (node), true);
5290
5291       /* Update load and lane permutations.  */
5292       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5293         {
5294           /* First try to absorb the input vector layouts.  If that fails,
5295              force the inputs to have layout LAYOUT_I too.  We checked that
5296              that was possible before deciding to use nonzero output layouts.
5297              (Note that at this stage we don't really have any guarantee that
5298              the target supports the original VEC_PERM_EXPR.)  */
5299           auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5300           auto_lane_permutation_t tmp_perm;
5301           tmp_perm.safe_splice (perm);
5302           change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5303           if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5304                                               tmp_perm,
5305                                               SLP_TREE_CHILDREN (node),
5306                                               false) >= 0)
5307             {
5308               if (dump_enabled_p ()
5309                   && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5310                                   perm.begin ()))
5311                 dump_printf_loc (MSG_NOTE, vect_location,
5312                                  "absorbing input layouts into %p\n",
5313                                  (void *) node);
5314               std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5315               bitmap_set_bit (fully_folded, node_i);
5316             }
5317           else
5318             {
5319               /* Not MSG_MISSED because it would make no sense to users.  */
5320               if (dump_enabled_p ())
5321                 dump_printf_loc (MSG_NOTE, vect_location,
5322                                  "failed to absorb input layouts into %p\n",
5323                                  (void *) node);
5324               change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5325             }
5326         }
5327       else
5328         {
5329           gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5330           auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5331           if (layout_i > 0)
5332             /* ???  When we handle non-bijective permutes the idea
5333                is that we can force the load-permutation to be
5334                { min, min + 1, min + 2, ... max }.  But then the
5335                scalar defs might no longer match the lane content
5336                which means wrong-code with live lane vectorization.
5337                So we possibly have to have NULL entries for those.  */
5338             vect_slp_permute (m_perms[layout_i], load_perm, true);
5339         }
5340     }
5341
5342   /* Do this before any nodes disappear, since it involves a walk
5343      over the leaves.  */
5344   remove_redundant_permutations ();
5345
5346   /* Replace each child with a correctly laid-out version.  */
5347   for (unsigned int node_i : m_partitioned_nodes)
5348     {
5349       /* Skip nodes that have already been handled above.  */
5350       if (bitmap_bit_p (fully_folded, node_i))
5351         continue;
5352
5353       auto &vertex = m_vertices[node_i];
5354       int in_layout_i = m_partitions[vertex.partition].layout;
5355       gcc_assert (in_layout_i >= 0);
5356
5357       unsigned j;
5358       slp_tree child;
5359       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5360         {
5361           if (!child)
5362             continue;
5363
5364           slp_tree new_child = get_result_with_layout (child, in_layout_i);
5365           if (new_child != child)
5366             {
                   /* Swap NEW_CHILD in as the J'th child and count the new
                      reference to it.  */
5367               vect_free_slp_tree (child);
5368               SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5369               new_child->refcnt += 1;
5370             }
5371         }
5372     }
5373 }
5374
5375 /* Elide load permutations that are not necessary.  Such permutations might
5376    be pre-existing, rather than created by the layout optimizations.  Only
        the nodes in M_LEAFS are examined.  */
5377
5378 void
5379 vect_optimize_slp_pass::remove_redundant_permutations ()
5380 {
5381   for (unsigned int node_i : m_leafs)
5382     {
5383       slp_tree node = m_vertices[node_i].node;
5384       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5385         continue;
5386
5387       /* In basic block vectorization we allow any subchain of an interleaving
5388          chain.
5389          FORNOW: not in loop SLP because of realignment complications.  */
5390       if (is_a <bb_vec_info> (m_vinfo))
5391         {
               /* NODE is a subchain if each scalar load after the first is the
                  DR_GROUP_NEXT_ELEMENT of the load before it and has a
                  DR_GROUP_GAP of 1.  */
5392           bool subchain_p = true;
5393           stmt_vec_info next_load_info = NULL;
5394           stmt_vec_info load_info;
5395           unsigned j;
5396           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5397             {
5398               if (j != 0
5399                   && (next_load_info != load_info
5400                       || DR_GROUP_GAP (load_info) != 1))
5401                 {
5402                   subchain_p = false;
5403                   break;
5404                 }
5405               next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5406             }
5407           if (subchain_p)
5408             {
5409               SLP_TREE_LOAD_PERMUTATION (node).release ();
5410               continue;
5411             }
5412         }
5413       else
5414         {
5415           loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5416           stmt_vec_info load_info;
5417           bool this_load_permuted = false;
5418           unsigned j;
               /* Check whether the load permutation maps every lane J to J.  */
5419           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5420             if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5421               {
5422                 this_load_permuted = true;
5423                 break;
5424               }
5425           stmt_vec_info first_stmt_info
5426             = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5427           if (!this_load_permuted
5428               /* The load requires permutation when unrolling exposes
5429                  a gap either because the group is larger than the SLP
5430                  group-size or because there is a gap between the groups.  */
5431               && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5432                   || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5433                       && DR_GROUP_GAP (first_stmt_info) == 0)))
5434             {
5435               SLP_TREE_LOAD_PERMUTATION (node).release ();
5436               continue;
5437             }
5438         }
5439     }
5440 }
5441
5442 /* Print the partition graph and layout information to the dump file.  */
5443
5444 void
5445 vect_optimize_slp_pass::dump ()
5446 {
5447   dump_printf_loc (MSG_NOTE, vect_location,
5448                    "SLP optimize permutations:\n");
       /* Layouts are listed from index 1; entry 0 of M_PERMS is not printed.  */
5449   for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5450     {
5451       dump_printf_loc (MSG_NOTE, vect_location, "  %d: { ", layout_i);
5452       const char *sep = "";
5453       for (unsigned int idx : m_perms[layout_i])
5454         {
5455           dump_printf (MSG_NOTE, "%s%d", sep, idx);
5456           sep = ", ";
5457         }
5458       dump_printf (MSG_NOTE, " }\n");
5459     }
5460   dump_printf_loc (MSG_NOTE, vect_location,
5461                    "SLP optimize partitions:\n");
5462   for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5463        ++partition_i)
5464     {
5465       auto &partition = m_partitions[partition_i];
5466       dump_printf_loc (MSG_NOTE, vect_location,  "  -------------\n");
5467       dump_printf_loc (MSG_NOTE, vect_location,
5468                        "  partition %d (layout %d):\n",
5469                        partition_i, partition.layout);
5470       dump_printf_loc (MSG_NOTE, vect_location, "    nodes:\n");
5471       for (unsigned int order_i = partition.node_begin;
5472            order_i < partition.node_end; ++order_i)
5473         {
5474           auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5475           dump_printf_loc (MSG_NOTE, vect_location, "      - %p:\n",
5476                            (void *) vertex.node);
5477           dump_printf_loc (MSG_NOTE, vect_location,
5478                            "          weight: %f\n",
5479                            vertex.weight.to_double ());
5480           if (vertex.out_degree)
5481             dump_printf_loc (MSG_NOTE, vect_location,
5482                              "          out weight: %f (degree %d)\n",
5483                              vertex.out_weight.to_double (),
5484                              vertex.out_degree);
5485           if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5486             dump_printf_loc (MSG_NOTE, vect_location,
5487                              "          op: VEC_PERM_EXPR\n");
5488           else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5489             dump_printf_loc (MSG_NOTE, vect_location,
5490                              "          op template: %G", rep->stmt);
5491         }
5492       dump_printf_loc (MSG_NOTE, vect_location, "    edges:\n");
5493       for (unsigned int order_i = partition.node_begin;
5494            order_i < partition.node_end; ++order_i)
5495         {
5496           unsigned int node_i = m_partitioned_nodes[order_i];
5497           auto &vertex = m_vertices[node_i];
               /* Print the other vertex on the left of the arrow if it is in
                  a lower-numbered partition than VERTEX, otherwise on the
                  right.  The bracketed number is the other vertex's
                  partition.  */
5498           auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5499             {
5500               auto &other_vertex = m_vertices[other_node_i];
5501               if (other_vertex.partition < vertex.partition)
5502                 dump_printf_loc (MSG_NOTE, vect_location,
5503                                  "      - %p [%d] --> %p\n",
5504                                  (void *) other_vertex.node,
5505                                  other_vertex.partition,
5506                                  (void *) vertex.node);
5507               else
5508                 dump_printf_loc (MSG_NOTE, vect_location,
5509                                  "      - %p --> [%d] %p\n",
5510                                  (void *) vertex.node,
5511                                  other_vertex.partition,
5512                                  (void *) other_vertex.node);
5513             };
5514           for_each_partition_edge (node_i, print_edge);
5515         }
5516
5517       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5518         {
5519           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5520           if (layout_costs.is_possible ())
5521             {
                   /* " (*)" marks the layout recorded in PARTITION.  */
5522               dump_printf_loc (MSG_NOTE, vect_location,
5523                                "    layout %d:%s\n", layout_i,
5524                                partition.layout == int (layout_i)
5525                                ? " (*)" : "");
                   /* Print the in, internal and out costs followed by the
                      result of adding them serially.  */
5526               slpg_layout_cost combined_cost = layout_costs.in_cost;
5527               combined_cost.add_serial_cost (layout_costs.internal_cost);
5528               combined_cost.add_serial_cost (layout_costs.out_cost);
5529 #define TEMPLATE "{depth: %f, total: %f}"
5530               dump_printf_loc (MSG_NOTE, vect_location,
5531                                "        " TEMPLATE "\n",
5532                                layout_costs.in_cost.depth.to_double (),
5533                                layout_costs.in_cost.total.to_double ());
5534               dump_printf_loc (MSG_NOTE, vect_location,
5535                                "      + " TEMPLATE "\n",
5536                                layout_costs.internal_cost.depth.to_double (),
5537                                layout_costs.internal_cost.total.to_double ());
5538               dump_printf_loc (MSG_NOTE, vect_location,
5539                                "      + " TEMPLATE "\n",
5540                                layout_costs.out_cost.depth.to_double (),
5541                                layout_costs.out_cost.total.to_double ());
5542               dump_printf_loc (MSG_NOTE, vect_location,
5543                                "      = " TEMPLATE "\n",
5544                                combined_cost.depth.to_double (),
5545                                combined_cost.total.to_double ());
5546 #undef TEMPLATE
5547             }
5548           else
5549             dump_printf_loc (MSG_NOTE, vect_location,
5550                              "    layout %d: rejected\n", layout_i);
5551         }
5552     }
5553 }
5554
5555 /* Main entry point for the SLP graph optimization pass.  */
5556
5557 void
5558 vect_optimize_slp_pass::run ()
5559 {
5560   build_graph ();
5561   create_partitions ();
5562   start_choosing_layouts ();
       /* The layout passes only run when M_PERMS has more than one entry;
          otherwise just elide redundant load permutations.  */
5563   if (m_perms.length () > 1)
5564     {
5565       forward_pass ();
5566       backward_pass ();
5567       if (dump_enabled_p ())
5568         dump ();
5569       materialize ();
           /* The layouts have been applied, so release each of them.  */
5570       while (!m_perms.is_empty ())
5571         m_perms.pop ().release ();
5572     }
5573   else
5574     remove_redundant_permutations ();
5575   free_graph (m_slpg);
5576 }
5577
5578 /* Optimize the SLP graph of VINFO.  */
5579
5580 void
5581 vect_optimize_slp (vec_info *vinfo)
5582 {
       /* Without any SLP instances there is no graph to optimize.  */
5583   if (vinfo->slp_instances.is_empty ())
5584     return;
5585   vect_optimize_slp_pass (vinfo).run ();
5586 }
5587
5588 /* Gather loads reachable from the individual SLP graph entries.  */
5589
5590 void
5591 vect_gather_slp_loads (vec_info *vinfo)
5592 {
5593   unsigned i;
5594   slp_instance instance;
5595   FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5596     {
           /* VISITED is local to each instance, so it is not shared between
              the per-instance walks.  */
5597       hash_set<slp_tree> visited;
5598       vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5599                              SLP_INSTANCE_TREE (instance), visited);
5600     }
5601 }
5602
5603
5604 /* For each possible SLP instance decide whether to SLP it and calculate overall
5605    unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
5606    least one instance.  */
5607
5608 bool
5609 vect_make_slp_decision (loop_vec_info loop_vinfo)
5610 {
5611   unsigned int i;
5612   poly_uint64 unrolling_factor = 1;
5613   const vec<slp_instance> &slp_instances
5614     = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5615   slp_instance instance;
5616   int decided_to_slp = 0;
5617
5618   DUMP_VECT_SCOPE ("vect_make_slp_decision");
5619
5620   FOR_EACH_VEC_ELT (slp_instances, i, instance)
5621     {
5622       /* FORNOW: SLP if you can.  */
5623       /* All unroll factors have the form:
5624
5625            GET_MODE_SIZE (vinfo->vector_mode) * X
5626
5627          for some rational X, so they must have a common multiple.  */
5628       unrolling_factor
5629         = force_common_multiple (unrolling_factor,
5630                                  SLP_INSTANCE_UNROLLING_FACTOR (instance));
5631
5632       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
5633          call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5634          loop-based vectorization.  Such stmts will be marked as HYBRID.  */
5635       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5636       decided_to_slp++;
5637     }
5638
       /* Record the common multiple of all the instances' unrolling
          factors.  */
5639   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5640
5641   if (decided_to_slp && dump_enabled_p ())
5642     {
5643       dump_printf_loc (MSG_NOTE, vect_location,
5644                        "Decided to SLP %d instances. Unrolling factor ",
5645                        decided_to_slp);
5646       dump_dec (MSG_NOTE, unrolling_factor);
5647       dump_printf (MSG_NOTE, "\n");
5648     }
5649
5650   return (decided_to_slp > 0);
5651 }
5652
5653 /* Private data for vect_detect_hybrid_slp.  */
5654 struct vdhs_data
5655 {
       /* The loop_vec_info used to look up the definitions of operands.  */
5656   loop_vec_info loop_vinfo;
       /* Statements newly marked as hybrid are pushed onto this vector.  */
5657   vec<stmt_vec_info> *worklist;
5658 };
5659
5660 /* Walker for walk_gimple_op.  DATA is a walk_stmt_info whose info field
        points to a vdhs_data.  If the statement to vectorize for the
        definition of operand *TP is PURE_SLP, mark it as hybrid and push it
        onto the vdhs_data's worklist.  Always returns NULL_TREE.  */
5661
5662 static tree
5663 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5664 {
5665   walk_stmt_info *wi = (walk_stmt_info *)data;
5666   vdhs_data *dat = (vdhs_data *)wi->info;
5667
       /* Ignore the operand when it is flagged as the LHS.  */
5668   if (wi->is_lhs)
5669     return NULL_TREE;
5670
       /* Ignore operands for which the loop_vec_info finds no defining
          statement.  */
5671   stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5672   if (!def_stmt_info)
5673     return NULL_TREE;
5674   def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5675   if (PURE_SLP_STMT (def_stmt_info))
5676     {
5677       if (dump_enabled_p ())
5678         dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5679                          def_stmt_info->stmt);
5680       STMT_SLP_TYPE (def_stmt_info) = hybrid;
5681       dat->worklist->safe_push (def_stmt_info);
5682     }
5683
5684   return NULL_TREE;
5685 }
5686
5687 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
5688    if so, otherwise pushing it to WORKLIST.  */
5689
5690 static void
5691 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5692                                vec<stmt_vec_info> &worklist,
5693                                stmt_vec_info stmt_info)
5694 {
5695   if (dump_enabled_p ())
5696     dump_printf_loc (MSG_NOTE, vect_location,
5697                      "Processing hybrid candidate : %G", stmt_info->stmt);
5698   stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5699   imm_use_iterator iter2;
5700   ssa_op_iter iter1;
5701   use_operand_p use_p;
5702   def_operand_p def_p;
5703   bool any_def = false;
5704   FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5705     {
5706       any_def = true;
5707       FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5708         {
               /* Uses in debug statements are ignored.  */
5709           if (is_gimple_debug (USE_STMT (use_p)))
5710             continue;
5711           stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5712           /* An out-of loop use means this is a loop_vect sink.  */
5713           if (!use_info)
5714             {
5715               if (dump_enabled_p ())
5716                 dump_printf_loc (MSG_NOTE, vect_location,
5717                                  "Found loop_vect sink: %G", stmt_info->stmt);
5718               worklist.safe_push (stmt_info);
5719               return;
5720             }
5721           else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5722             {
5723               if (dump_enabled_p ())
5724                 dump_printf_loc (MSG_NOTE, vect_location,
5725                                  "Found loop_vect use: %G", use_info->stmt);
5726               worklist.safe_push (stmt_info);
5727               return;
5728             }
5729         }
5730     }
5731   /* No def means this is a loop_vect sink.  */
5732   if (!any_def)
5733     {
5734       if (dump_enabled_p ())
5735         dump_printf_loc (MSG_NOTE, vect_location,
5736                          "Found loop_vect sink: %G", stmt_info->stmt);
5737       worklist.safe_push (stmt_info);
5738       return;
5739     }
       /* Every non-debug use was in a statement with an SLP type.  */
5740   if (dump_enabled_p ())
5741     dump_printf_loc (MSG_NOTE, vect_location,
5742                      "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5743   STMT_SLP_TYPE (stmt_info) = pure_slp;
5744 }
5745
5746 /* Find stmts that must be both vectorized and SLPed.  */
5747
5748 void
5749 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5750 {
5751   DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5752
5753   /* All stmts participating in SLP are marked pure_slp, all other
5754      stmts are loop_vect.
5755      First collect all loop_vect stmts into a worklist.
5756      SLP patterns cause not all original scalar stmts to appear in
5757      SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5758      Rectify this here and do a backward walk over the IL only considering
5759      stmts as loop_vect when they are used by a loop_vect stmt and otherwise
5760      mark them as pure_slp.  */
5761   auto_vec<stmt_vec_info> worklist;
5762   for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5763     {
5764       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5765       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5766            gsi_next (&gsi))
5767         {
5768           gphi *phi = gsi.phi ();
5769           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5770           if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5771             maybe_push_to_hybrid_worklist (loop_vinfo,
5772                                            worklist, stmt_info);
5773         }
5774       for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5775            gsi_prev (&gsi))
5776         {
5777           gimple *stmt = gsi_stmt (gsi);
5778           if (is_gimple_debug (stmt))
5779             continue;
5780           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
               /* For a statement that is part of a pattern, consider the
                  statements in its pattern definition sequence and then its
                  related statement instead.  */
5781           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5782             {
5783               for (gimple_stmt_iterator gsi2
5784                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5785                    !gsi_end_p (gsi2); gsi_next (&gsi2))
5786                 {
5787                   stmt_vec_info patt_info
5788                     = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
5789                   if (!STMT_SLP_TYPE (patt_info)
5790                       && STMT_VINFO_RELEVANT (patt_info))
5791                     maybe_push_to_hybrid_worklist (loop_vinfo,
5792                                                    worklist, patt_info);
5793                 }
5794               stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5795             }
5796           if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5797             maybe_push_to_hybrid_worklist (loop_vinfo,
5798                                            worklist, stmt_info);
5799         }
5800     }
5801
5802   /* Now we have a worklist of non-SLP stmts, follow use->def chains and
5803      mark any SLP vectorized stmt as hybrid.
5804      ???  We're visiting def stmts N times (once for each non-SLP and
5805      once for each hybrid-SLP use).  */
5806   walk_stmt_info wi;
5807   vdhs_data dat;
       /* The walker pushes newly hybrid statements onto WORKLIST, so the loop
          below keeps going until none are left.  */
5808   dat.worklist = &worklist;
5809   dat.loop_vinfo = loop_vinfo;
5810   memset (&wi, 0, sizeof (wi));
5811   wi.info = (void *)&dat;
5812   while (!worklist.is_empty ())
5813     {
5814       stmt_vec_info stmt_info = worklist.pop ();
5815       /* Since SSA operands are not set up for pattern stmts we need
5816          to use walk_gimple_op.  */
5817       wi.is_lhs = 0;
5818       walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
5819       /* For gather/scatter make sure to walk the offset operand, that
5820          can be a scaling and conversion away.  */
5821       gather_scatter_info gs_info;
5822       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5823           && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
5824         {
5825           int dummy;
5826           vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
5827         }
5828     }
5829 }
5830
5831
5832 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks.
        Every visited PHI and statement gets a UID of 0 (the destructor
        resets it to -1), and all of them except debug statements are added
        to the vec_info.  */
5833
5834 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
5835   : vec_info (vec_info::bb, shared),
5836     bbs (_bbs),
5837     roots (vNULL)
5838 {
5839   for (unsigned i = 0; i < bbs.length (); ++i)
5840     {
           /* The PHIs of the first block are not visited.  */
5841       if (i != 0)
5842         for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5843              gsi_next (&si))
5844           {
5845             gphi *phi = si.phi ();
5846             gimple_set_uid (phi, 0);
5847             add_stmt (phi);
5848           }
5849       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5850            !gsi_end_p (gsi); gsi_next (&gsi))
5851         {
5852           gimple *stmt = gsi_stmt (gsi);
5853           gimple_set_uid (stmt, 0);
               /* Debug statements get the UID but are not added.  */
5854           if (is_gimple_debug (stmt))
5855             continue;
5856           add_stmt (stmt);
5857         }
5858     }
5859 }
5860
5861
5862 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
5863    stmts in the basic block.  */
5864
5865 _bb_vec_info::~_bb_vec_info ()
5866 {
5867   /* Reset region marker.  */
5868   for (unsigned i = 0; i < bbs.length (); ++i)
5869     {
           /* As in the constructor, the PHIs of the first block are not
              visited.  */
5870       if (i != 0)
5871         for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5872              gsi_next (&si))
5873           {
5874             gphi *phi = si.phi ();
5875             gimple_set_uid (phi, -1);
5876           }
5877       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5878            !gsi_end_p (gsi); gsi_next (&gsi))
5879         {
5880           gimple *stmt = gsi_stmt (gsi);
5881           gimple_set_uid (stmt, -1);
5882         }
5883     }
5884
       /* Release the vectors held by each root before the ROOTS vector
          itself.  */
5885   for (unsigned i = 0; i < roots.length (); ++i)
5886     {
5887       roots[i].stmts.release ();
5888       roots[i].roots.release ();
5889     }
5890   roots.release ();
5891 }
5892
5893 /* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
5894    given that the child nodes have already been processed, and that
5895    their def types currently match their SLP node's def type.  */
5896
5897 static bool
5898 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
5899                                     slp_instance node_instance,
5900                                     stmt_vector_for_cost *cost_vec)
5901 {
5902   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5903
5904   /* Calculate the number of vector statements to be created for the
5905      scalar stmts in this node.  For SLP reductions it is equal to the
5906      number of vector statements in the children (which has already been
5907      calculated by the recursive call).  Otherwise it is the number of
5908      scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
5909      VF divided by the number of elements in a vector.  */
5910   if (!STMT_VINFO_DATA_REF (stmt_info)
5911       && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5912     {
           /* Copy the count from the first child that is an internal def.  */
5913       for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
5914         if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
5915           {
5916             SLP_TREE_NUMBER_OF_VEC_STMTS (node)
5917               = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
5918             break;
5919           }
5920     }
5921   else
5922     {
           /* Outside loop vectorization use a vectorization factor of 1.  */
5923       poly_uint64 vf;
5924       if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
5925         vf = loop_vinfo->vectorization_factor;
5926       else
5927         vf = 1;
5928       unsigned int group_size = SLP_TREE_LANES (node);
5929       tree vectype = SLP_TREE_VECTYPE (node);
5930       SLP_TREE_NUMBER_OF_VEC_STMTS (node)
5931         = vect_get_num_vectors (vf * group_size, vectype);
5932     }
5933
5934   /* Handle purely internal nodes.  */
5935   if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5936     {
5937       if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
5938         return false;
5939
           /* Each live scalar statement must also pass the live-operation
              check for its lane.  */
5940       stmt_vec_info slp_stmt_info;
5941       unsigned int i;
5942       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
5943         {
5944           if (STMT_VINFO_LIVE_P (slp_stmt_info)
5945               && !vectorizable_live_operation (vinfo,
5946                                                slp_stmt_info, NULL, node,
5947                                                node_instance, i,
5948                                                false, cost_vec))
5949             return false;
5950         }
5951       return true;
5952     }
5953
5954   bool dummy;
5955   return vect_analyze_stmt (vinfo, stmt_info, &dummy,
5956                             node, node_instance, cost_vec);
5957 }
5958
5959 /* Try to build NODE from scalars, returning true on success.
5960    NODE_INSTANCE is the SLP instance that contains NODE.  */
5961
5962 static bool
5963 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
5964                               slp_instance node_instance)
5965 {
5966   stmt_vec_info stmt_info;
5967   unsigned int i;
5968
       /* This is only done for basic-block vectorization, never for the
          root of NODE_INSTANCE, and only if NODE has scalar statements,
          none of which is a pattern statement.  */
5969   if (!is_a <bb_vec_info> (vinfo)
5970       || node == SLP_INSTANCE_TREE (node_instance)
5971       || !SLP_TREE_SCALAR_STMTS (node).exists ()
5972       || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
5973       /* Force the mask use to be built from scalars instead.  */
5974       || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
5975     return false;
5976
5977   if (dump_enabled_p ())
5978     dump_printf_loc (MSG_NOTE, vect_location,
5979                      "Building vector operands of %p from scalars instead\n",
5980                      (void *) node);
5981
5982   /* Don't remove and free the child nodes here, since they could be
5983      referenced by other structures.  The analysis and scheduling phases
5984      (need to) ignore child nodes of anything that isn't vect_internal_def.  */
5985   unsigned int group_size = SLP_TREE_LANES (node);
5986   SLP_TREE_DEF_TYPE (node) = vect_external_def;
5987   /* Invariants get their vector type from the uses.  */
5988   SLP_TREE_VECTYPE (node) = NULL_TREE;
5989   SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
5990   SLP_TREE_LOAD_PERMUTATION (node).release ();
       /* The scalar operand for each lane is the LHS of that lane's original
          scalar statement.  */
5991   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5992     {
5993       tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
5994       SLP_TREE_SCALAR_OPS (node)[i] = lhs;
5995     }
5996   return true;
5997 }
5998
5999 /* Return true if all elements of the slice are the same.  */
6000 bool
6001 vect_scalar_ops_slice::all_same_p () const
6002 {
6003   for (unsigned int i = 1; i < length; ++i)
6004     if (!operand_equal_p (op (0), op (i)))
6005       return false;
6006   return true;
6007 }
6008
6009 hashval_t
6010 vect_scalar_ops_slice_hash::hash (const value_type &s)
6011 {
6012   hashval_t hash = 0;
6013   for (unsigned i = 0; i < s.length; ++i)
6014     hash = iterative_hash_expr (s.op (i), hash);
6015   return hash;
6016 }
6017
6018 bool
6019 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6020                                    const compare_type &s2)
6021 {
6022   if (s1.length != s2.length)
6023     return false;
6024   for (unsigned i = 0; i < s1.length; ++i)
6025     if (!operand_equal_p (s1.op (i), s2.op (i)))
6026       return false;
6027   return true;
6028 }
6029
6030 /* Compute the prologue cost for invariant or constant operands represented
6031    by NODE.  */
6032
6033 static void
6034 vect_prologue_cost_for_slp (slp_tree node,
6035                             stmt_vector_for_cost *cost_vec)
6036 {
6037   /* There's a special case of an existing vector, that costs nothing.  */
6038   if (SLP_TREE_SCALAR_OPS (node).length () == 0
6039       && !SLP_TREE_VEC_DEFS (node).is_empty ())
6040     return;
6041   /* Without looking at the actual initializer a vector of
6042      constants can be implemented as load from the constant pool.
6043      When all elements are the same we can use a splat.  */
6044   tree vectype = SLP_TREE_VECTYPE (node);
6045   unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6046   unsigned HOST_WIDE_INT const_nunits;
6047   unsigned nelt_limit;
6048   auto ops = &SLP_TREE_SCALAR_OPS (node);
6049   auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6050   if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6051       && ! multiple_p (const_nunits, group_size))
6052     {
6053       nelt_limit = const_nunits;
6054       hash_set<vect_scalar_ops_slice_hash> vector_ops;
6055       for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6056         if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6057           starts.quick_push (i * const_nunits);
6058     }
6059   else
6060     {
6061       /* If either the vector has variable length or the vectors
6062          are composed of repeated whole groups we only need to
6063          cost construction once.  All vectors will be the same.  */
6064       nelt_limit = group_size;
6065       starts.quick_push (0);
6066     }
6067   /* ???  We're just tracking whether vectors in a single node are the same.
6068      Ideally we'd do something more global.  */
6069   for (unsigned int start : starts)
6070     {
6071       vect_cost_for_stmt kind;
6072       if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6073         kind = vector_load;
6074       else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6075         kind = scalar_to_vec;
6076       else
6077         kind = vec_construct;
6078       record_stmt_cost (cost_vec, 1, kind, node, vectype, 0, vect_prologue);
6079     }
6080 }
6081
6082 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6083    the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
6084
6085    Return true if the operations are supported.  */
6086
6087 static bool
6088 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6089                                   slp_instance node_instance,
6090                                   hash_set<slp_tree> &visited_set,
6091                                   vec<slp_tree> &visited_vec,
6092                                   stmt_vector_for_cost *cost_vec)
6093 {
6094   int i, j;
6095   slp_tree child;
6096
6097   /* Assume we can code-generate all invariants.  */
6098   if (!node
6099       || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6100       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6101     return true;
6102
6103   if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6104     {
6105       if (dump_enabled_p ())
6106         dump_printf_loc (MSG_NOTE, vect_location,
6107                          "Failed cyclic SLP reference in %p\n", (void *) node);
6108       return false;
6109     }
6110   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6111
6112   /* If we already analyzed the exact same set of scalar stmts we're done.
6113      We share the generated vector stmts for those.  */
6114   if (visited_set.add (node))
6115     return true;
6116   visited_vec.safe_push (node);
6117
6118   bool res = true;
6119   unsigned visited_rec_start = visited_vec.length ();
6120   unsigned cost_vec_rec_start = cost_vec->length ();
6121   bool seen_non_constant_child = false;
6122   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6123     {
6124       res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6125                                               visited_set, visited_vec,
6126                                               cost_vec);
6127       if (!res)
6128         break;
6129       if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6130         seen_non_constant_child = true;
6131     }
6132   /* We're having difficulties scheduling nodes with just constant
6133      operands and no scalar stmts since we then cannot compute a stmt
6134      insertion place.  */
6135   if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6136     {
6137       if (dump_enabled_p ())
6138         dump_printf_loc (MSG_NOTE, vect_location,
6139                          "Cannot vectorize all-constant op node %p\n",
6140                          (void *) node);
6141       res = false;
6142     }
6143
6144   if (res)
6145     res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6146                                               cost_vec);
6147   /* If analysis failed we have to pop all recursive visited nodes
6148      plus ourselves.  */
6149   if (!res)
6150     {
6151       while (visited_vec.length () >= visited_rec_start)
6152         visited_set.remove (visited_vec.pop ());
6153       cost_vec->truncate (cost_vec_rec_start);
6154     }
6155
6156   /* When the node can be vectorized cost invariant nodes it references.
6157      This is not done in DFS order to allow the refering node
6158      vectorizable_* calls to nail down the invariant nodes vector type
6159      and possibly unshare it if it needs a different vector type than
6160      other referrers.  */
6161   if (res)
6162     FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6163       if (child
6164           && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6165               || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6166           /* Perform usual caching, note code-generation still
6167              code-gens these nodes multiple times but we expect
6168              to CSE them later.  */
6169           && !visited_set.add (child))
6170         {
6171           visited_vec.safe_push (child);
6172           /* ???  After auditing more code paths make a "default"
6173              and push the vector type from NODE to all children
6174              if it is not already set.  */
6175           /* Compute the number of vectors to be generated.  */
6176           tree vector_type = SLP_TREE_VECTYPE (child);
6177           if (!vector_type)
6178             {
6179               /* For shifts with a scalar argument we don't need
6180                  to cost or code-generate anything.
6181                  ???  Represent this more explicitely.  */
6182               gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6183                            == shift_vec_info_type)
6184                           && j == 1);
6185               continue;
6186             }
6187           unsigned group_size = SLP_TREE_LANES (child);
6188           poly_uint64 vf = 1;
6189           if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6190             vf = loop_vinfo->vectorization_factor;
6191           SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6192             = vect_get_num_vectors (vf * group_size, vector_type);
6193           /* And cost them.  */
6194           vect_prologue_cost_for_slp (child, cost_vec);
6195         }
6196
6197   /* If this node or any of its children can't be vectorized, try pruning
6198      the tree here rather than felling the whole thing.  */
6199   if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6200     {
6201       /* We'll need to revisit this for invariant costing and number
6202          of vectorized stmt setting.   */
6203       res = true;
6204     }
6205
6206   return res;
6207 }
6208
6209 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6210    region and that can be vectorized using vectorizable_live_operation
6211    with STMT_VINFO_LIVE_P.  Not handled live operations will cause the
6212    scalar code computing it to be retained.  */
6213
6214 static void
6215 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6216                              slp_instance instance,
6217                              stmt_vector_for_cost *cost_vec,
6218                              hash_set<stmt_vec_info> &svisited,
6219                              hash_set<slp_tree> &visited)
6220 {
6221   if (visited.add (node))
6222     return;
6223
6224   unsigned i;
6225   stmt_vec_info stmt_info;
6226   stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6227   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6228     {
6229       if (svisited.contains (stmt_info))
6230         continue;
6231       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6232       if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6233           && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6234         /* Only the pattern root stmt computes the original scalar value.  */
6235         continue;
6236       bool mark_visited = true;
6237       gimple *orig_stmt = orig_stmt_info->stmt;
6238       ssa_op_iter op_iter;
6239       def_operand_p def_p;
6240       FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6241         {
6242           imm_use_iterator use_iter;
6243           gimple *use_stmt;
6244           stmt_vec_info use_stmt_info;
6245           FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6246             if (!is_gimple_debug (use_stmt))
6247               {
6248                 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6249                 if (!use_stmt_info
6250                     || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6251                   {
6252                     STMT_VINFO_LIVE_P (stmt_info) = true;
6253                     if (vectorizable_live_operation (bb_vinfo, stmt_info,
6254                                                      NULL, node, instance, i,
6255                                                      false, cost_vec))
6256                       /* ???  So we know we can vectorize the live stmt
6257                          from one SLP node.  If we cannot do so from all
6258                          or none consistently we'd have to record which
6259                          SLP node (and lane) we want to use for the live
6260                          operation.  So make sure we can code-generate
6261                          from all nodes.  */
6262                       mark_visited = false;
6263                     else
6264                       STMT_VINFO_LIVE_P (stmt_info) = false;
6265                     break;
6266                   }
6267               }
6268           /* We have to verify whether we can insert the lane extract
6269              before all uses.  The following is a conservative approximation.
6270              We cannot put this into vectorizable_live_operation because
6271              iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6272              doesn't work.
6273              Note that while the fact that we emit code for loads at the
6274              first load should make this a non-problem leafs we construct
6275              from scalars are vectorized after the last scalar def.
6276              ???  If we'd actually compute the insert location during
6277              analysis we could use sth less conservative than the last
6278              scalar stmt in the node for the dominance check.  */
6279           /* ???  What remains is "live" uses in vector CTORs in the same
6280              SLP graph which is where those uses can end up code-generated
6281              right after their definition instead of close to their original
6282              use.  But that would restrict us to code-generate lane-extracts
6283              from the latest stmt in a node.  So we compensate for this
6284              during code-generation, simply not replacing uses for those
6285              hopefully rare cases.  */
6286           if (STMT_VINFO_LIVE_P (stmt_info))
6287             FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6288               if (!is_gimple_debug (use_stmt)
6289                   && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6290                       || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6291                   && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6292                 {
6293                   if (dump_enabled_p ())
6294                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6295                                      "Cannot determine insertion place for "
6296                                      "lane extract\n");
6297                   STMT_VINFO_LIVE_P (stmt_info) = false;
6298                   mark_visited = true;
6299                 }
6300         }
6301       if (mark_visited)
6302         svisited.add (stmt_info);
6303     }
6304
6305   slp_tree child;
6306   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6307     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6308       vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
6309                                    cost_vec, svisited, visited);
6310 }
6311
6312 /* Determine whether we can vectorize the reduction epilogue for INSTANCE.  */
6313
6314 static bool
6315 vectorizable_bb_reduc_epilogue (slp_instance instance,
6316                                 stmt_vector_for_cost *cost_vec)
6317 {
6318   gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6319   enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6320   if (reduc_code == MINUS_EXPR)
6321     reduc_code = PLUS_EXPR;
6322   internal_fn reduc_fn;
6323   tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6324   if (!vectype
6325       || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6326       || reduc_fn == IFN_LAST
6327       || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6328       || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6329                                      TREE_TYPE (vectype)))
6330     return false;
6331
6332   /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6333      cost log2 vector operations plus shuffles and one extraction.  */
6334   unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6335   record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6336                     vectype, 0, vect_body);
6337   record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6338                     vectype, 0, vect_body);
6339   record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6340                     vectype, 0, vect_body);
6341   return true;
6342 }
6343
6344 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6345    and recurse to children.  */
6346
6347 static void
6348 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6349                               hash_set<slp_tree> &visited)
6350 {
6351   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6352       || visited.add (node))
6353     return;
6354
6355   stmt_vec_info stmt;
6356   unsigned i;
6357   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6358     roots.remove (vect_orig_stmt (stmt));
6359
6360   slp_tree child;
6361   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6362     if (child)
6363       vect_slp_prune_covered_roots (child, roots, visited);
6364 }
6365
6366 /* Analyze statements in SLP instances of VINFO.  Return true if the
6367    operations are supported. */
6368
6369 bool
6370 vect_slp_analyze_operations (vec_info *vinfo)
6371 {
6372   slp_instance instance;
6373   int i;
6374
6375   DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6376
6377   hash_set<slp_tree> visited;
6378   for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6379     {
6380       auto_vec<slp_tree> visited_vec;
6381       stmt_vector_for_cost cost_vec;
6382       cost_vec.create (2);
6383       if (is_a <bb_vec_info> (vinfo))
6384         vect_location = instance->location ();
6385       if (!vect_slp_analyze_node_operations (vinfo,
6386                                              SLP_INSTANCE_TREE (instance),
6387                                              instance, visited, visited_vec,
6388                                              &cost_vec)
6389           /* CTOR instances require vectorized defs for the SLP tree root.  */
6390           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6391               && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6392                   != vect_internal_def
6393                   /* Make sure we vectorized with the expected type.  */
6394                   || !useless_type_conversion_p
6395                         (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6396                                               (instance->root_stmts[0]->stmt))),
6397                          TREE_TYPE (SLP_TREE_VECTYPE
6398                                             (SLP_INSTANCE_TREE (instance))))))
6399           /* Check we can vectorize the reduction.  */
6400           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6401               && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6402         {
6403           slp_tree node = SLP_INSTANCE_TREE (instance);
6404           stmt_vec_info stmt_info;
6405           if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6406             stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6407           else
6408             stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6409           if (dump_enabled_p ())
6410             dump_printf_loc (MSG_NOTE, vect_location,
6411                              "removing SLP instance operations starting from: %G",
6412                              stmt_info->stmt);
6413           vect_free_slp_instance (instance);
6414           vinfo->slp_instances.ordered_remove (i);
6415           cost_vec.release ();
6416           while (!visited_vec.is_empty ())
6417             visited.remove (visited_vec.pop ());
6418         }
6419       else
6420         {
6421           i++;
6422           if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6423             {
6424               add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6425               cost_vec.release ();
6426             }
6427           else
6428             /* For BB vectorization remember the SLP graph entry
6429                cost for later.  */
6430             instance->cost_vec = cost_vec;
6431         }
6432     }
6433
6434   /* Now look for SLP instances with a root that are covered by other
6435      instances and remove them.  */
6436   hash_set<stmt_vec_info> roots;
6437   for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6438     if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6439       roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6440   if (!roots.is_empty ())
6441     {
6442       visited.empty ();
6443       for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6444         vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6445                                       visited);
6446       for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6447         if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6448             && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6449           {
6450             stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6451             if (dump_enabled_p ())
6452               dump_printf_loc (MSG_NOTE, vect_location,
6453                                "removing SLP instance operations starting "
6454                                "from: %G", root->stmt);
6455             vect_free_slp_instance (instance);
6456             vinfo->slp_instances.ordered_remove (i);
6457           }
6458         else
6459           ++i;
6460     }
6461
6462   /* Compute vectorizable live stmts.  */
6463   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6464     {
6465       hash_set<stmt_vec_info> svisited;
6466       hash_set<slp_tree> visited;
6467       for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6468         {
6469           vect_location = instance->location ();
6470           vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6471                                        instance, &instance->cost_vec, svisited,
6472                                        visited);
6473         }
6474     }
6475
6476   return !vinfo->slp_instances.is_empty ();
6477 }
6478
6479 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
6480    closing the eventual chain.  */
6481
6482 static slp_instance
6483 get_ultimate_leader (slp_instance instance,
6484                      hash_map<slp_instance, slp_instance> &instance_leader)
6485 {
6486   auto_vec<slp_instance *, 8> chain;
6487   slp_instance *tem;
6488   while (*(tem = instance_leader.get (instance)) != instance)
6489     {
6490       chain.safe_push (tem);
6491       instance = *tem;
6492     }
6493   while (!chain.is_empty ())
6494     *chain.pop () = instance;
6495   return instance;
6496 }
6497
6498 namespace {
6499 /* Subroutine of vect_bb_partition_graph_r.  Map KEY to INSTANCE in
6500    KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6501    for KEY.  Return true if KEY was already in KEY_TO_INSTANCE.
6502
6503    INSTANCE_LEADER is as for get_ultimate_leader.  */
6504
6505 template<typename T>
6506 bool
6507 vect_map_to_instance (slp_instance instance, T key,
6508                       hash_map<T, slp_instance> &key_to_instance,
6509                       hash_map<slp_instance, slp_instance> &instance_leader)
6510 {
6511   bool existed_p;
6512   slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6513   if (!existed_p)
6514     ;
6515   else if (key_instance != instance)
6516     {
6517       /* If we're running into a previously marked key make us the
6518          leader of the current ultimate leader.  This keeps the
6519          leader chain acyclic and works even when the current instance
6520          connects two previously independent graph parts.  */
6521       slp_instance key_leader
6522         = get_ultimate_leader (key_instance, instance_leader);
6523       if (key_leader != instance)
6524         instance_leader.put (key_leader, instance);
6525     }
6526   key_instance = instance;
6527   return existed_p;
6528 }
6529 }
6530
6531 /* Worker of vect_bb_partition_graph, recurse on NODE.  */
6532
6533 static void
6534 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6535                            slp_instance instance, slp_tree node,
6536                            hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6537                            hash_map<slp_tree, slp_instance> &node_to_instance,
6538                            hash_map<slp_instance, slp_instance> &instance_leader)
6539 {
6540   stmt_vec_info stmt_info;
6541   unsigned i;
6542
6543   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6544     vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6545                           instance_leader);
6546
6547   if (vect_map_to_instance (instance, node, node_to_instance,
6548                             instance_leader))
6549     return;
6550
6551   slp_tree child;
6552   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6553     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6554       vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6555                                  node_to_instance, instance_leader);
6556 }
6557
6558 /* Partition the SLP graph into pieces that can be costed independently.  */
6559
6560 static void
6561 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6562 {
6563   DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6564
6565   /* First walk the SLP graph assigning each involved scalar stmt a
6566      corresponding SLP graph entry and upon visiting a previously
6567      marked stmt, make the stmts leader the current SLP graph entry.  */
6568   hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6569   hash_map<slp_tree, slp_instance> node_to_instance;
6570   hash_map<slp_instance, slp_instance> instance_leader;
6571   slp_instance instance;
6572   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6573     {
6574       instance_leader.put (instance, instance);
6575       vect_bb_partition_graph_r (bb_vinfo,
6576                                  instance, SLP_INSTANCE_TREE (instance),
6577                                  stmt_to_instance, node_to_instance,
6578                                  instance_leader);
6579     }
6580
6581   /* Then collect entries to each independent subgraph.  */
6582   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6583     {
6584       slp_instance leader = get_ultimate_leader (instance, instance_leader);
6585       leader->subgraph_entries.safe_push (instance);
6586       if (dump_enabled_p ()
6587           && leader != instance)
6588         dump_printf_loc (MSG_NOTE, vect_location,
6589                          "instance %p is leader of %p\n",
6590                          (void *) leader, (void *) instance);
6591     }
6592 }
6593
6594 /* Compute the set of scalar stmts participating in internal and external
6595    nodes.  */
6596
6597 static void
6598 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6599                                          hash_set<slp_tree> &visited,
6600                                          hash_set<stmt_vec_info> &vstmts,
6601                                          hash_set<stmt_vec_info> &estmts)
6602 {
6603   int i;
6604   stmt_vec_info stmt_info;
6605   slp_tree child;
6606
6607   if (visited.add (node))
6608     return;
6609
6610   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6611     {
6612       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6613         vstmts.add (stmt_info);
6614
6615       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6616         if (child)
6617           vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6618                                                    vstmts, estmts);
6619     }
6620   else
6621     for (tree def : SLP_TREE_SCALAR_OPS (node))
6622       {
6623         stmt_vec_info def_stmt = vinfo->lookup_def (def);
6624         if (def_stmt)
6625           estmts.add (def_stmt);
6626       }
6627 }
6628
6629
6630 /* Compute the scalar cost of the SLP node NODE and its children
6631    and return it.  Do not account defs that are marked in LIFE and
6632    update LIFE according to uses of NODE.  */
6633
6634 static void
6635 vect_bb_slp_scalar_cost (vec_info *vinfo,
6636                          slp_tree node, vec<bool, va_heap> *life,
6637                          stmt_vector_for_cost *cost_vec,
6638                          hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6639                          hash_set<slp_tree> &visited)
6640 {
6641   unsigned i;
6642   stmt_vec_info stmt_info;
6643   slp_tree child;
6644
6645   if (visited.add (node))
6646     return;
6647
6648   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6649     {
6650       ssa_op_iter op_iter;
6651       def_operand_p def_p;
6652
6653       if ((*life)[i])
6654         continue;
6655
6656       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6657       gimple *orig_stmt = orig_stmt_info->stmt;
6658
6659       /* If there is a non-vectorized use of the defs then the scalar
6660          stmt is kept live in which case we do not account it or any
6661          required defs in the SLP children in the scalar cost.  This
6662          way we make the vectorization more costly when compared to
6663          the scalar cost.  */
6664       if (!STMT_VINFO_LIVE_P (stmt_info))
6665         {
6666           auto_vec<gimple *, 8> worklist;
6667           hash_set<gimple *> *worklist_visited = NULL;
6668           worklist.quick_push (orig_stmt);
6669           do
6670             {
6671               gimple *work_stmt = worklist.pop ();
6672               FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
6673                 {
6674                   imm_use_iterator use_iter;
6675                   gimple *use_stmt;
6676                   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
6677                                          DEF_FROM_PTR (def_p))
6678                     if (!is_gimple_debug (use_stmt))
6679                       {
6680                         stmt_vec_info use_stmt_info
6681                           = vinfo->lookup_stmt (use_stmt);
6682                         if (!use_stmt_info
6683                             || !vectorized_scalar_stmts.contains (use_stmt_info))
6684                           {
6685                             if (use_stmt_info
6686                                 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
6687                               {
6688                                 /* For stmts participating in patterns we have
6689                                    to check its uses recursively.  */
6690                                 if (!worklist_visited)
6691                                   worklist_visited = new hash_set<gimple *> ();
6692                                 if (!worklist_visited->add (use_stmt))
6693                                   worklist.safe_push (use_stmt);
6694                                 continue;
6695                               }
6696                             (*life)[i] = true;
6697                             goto next_lane;
6698                           }
6699                       }
6700                 }
6701             }
6702           while (!worklist.is_empty ());
6703 next_lane:
6704           if (worklist_visited)
6705             delete worklist_visited;
6706           if ((*life)[i])
6707             continue;
6708         }
6709
6710       /* Count scalar stmts only once.  */
6711       if (gimple_visited_p (orig_stmt))
6712         continue;
6713       gimple_set_visited (orig_stmt, true);
6714
6715       vect_cost_for_stmt kind;
6716       if (STMT_VINFO_DATA_REF (orig_stmt_info))
6717         {
6718           if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
6719             kind = scalar_load;
6720           else
6721             kind = scalar_store;
6722         }
6723       else if (vect_nop_conversion_p (orig_stmt_info))
6724         continue;
6725       /* For single-argument PHIs assume coalescing which means zero cost
6726          for the scalar and the vector PHIs.  This avoids artificially
6727          favoring the vector path (but may pessimize it in some cases).  */
6728       else if (is_a <gphi *> (orig_stmt_info->stmt)
6729                && gimple_phi_num_args
6730                     (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
6731         continue;
6732       else
6733         kind = scalar_stmt;
6734       record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
6735                         SLP_TREE_VECTYPE (node), 0, vect_body);
6736     }
6737
6738   auto_vec<bool, 20> subtree_life;
6739   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6740     {
6741       if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6742         {
6743           /* Do not directly pass LIFE to the recursive call, copy it to
6744              confine changes in the callee to the current child/subtree.  */
6745           if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6746             {
6747               subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
6748               for (unsigned j = 0;
6749                    j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
6750                 {
6751                   auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
6752                   if (perm.first == i)
6753                     subtree_life[perm.second] = (*life)[j];
6754                 }
6755             }
6756           else
6757             {
6758               gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
6759               subtree_life.safe_splice (*life);
6760             }
6761           vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
6762                                    vectorized_scalar_stmts, visited);
6763           subtree_life.truncate (0);
6764         }
6765     }
6766 }
6767
6768 /* Comparator for the loop-index sorted cost vectors.  */
6769
6770 static int
6771 li_cost_vec_cmp (const void *a_, const void *b_)
6772 {
6773   auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
6774   auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
6775   if (a->first < b->first)
6776     return -1;
6777   else if (a->first == b->first)
6778     return 0;
6779   return 1;
6780 }
6781
6782 /* Check if vectorization of the basic block is profitable for the
6783    subgraph denoted by SLP_INSTANCES.  */
6784
6785 static bool
6786 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
6787                                     vec<slp_instance> slp_instances,
6788                                     loop_p orig_loop)
6789 {
6790   slp_instance instance;
6791   int i;
6792   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
6793   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
6794
6795   if (dump_enabled_p ())
6796     {
6797       dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
6798       hash_set<slp_tree> visited;
6799       FOR_EACH_VEC_ELT (slp_instances, i, instance)
6800         vect_print_slp_graph (MSG_NOTE, vect_location,
6801                               SLP_INSTANCE_TREE (instance), visited);
6802     }
6803
6804   /* Compute the set of scalar stmts we know will go away 'locally' when
6805      vectorizing.  This used to be tracked with just PURE_SLP_STMT but that's
6806      not accurate for nodes promoted extern late or for scalar stmts that
6807      are used both in extern defs and in vectorized defs.  */
6808   hash_set<stmt_vec_info> vectorized_scalar_stmts;
6809   hash_set<stmt_vec_info> scalar_stmts_in_externs;
6810   hash_set<slp_tree> visited;
6811   FOR_EACH_VEC_ELT (slp_instances, i, instance)
6812     {
6813       vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
6814                                                SLP_INSTANCE_TREE (instance),
6815                                                visited,
6816                                                vectorized_scalar_stmts,
6817                                                scalar_stmts_in_externs);
6818       for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
6819         vectorized_scalar_stmts.add (rstmt);
6820     }
6821   /* Scalar stmts used as defs in external nodes need to be preseved, so
6822      remove them from vectorized_scalar_stmts.  */
6823   for (stmt_vec_info stmt : scalar_stmts_in_externs)
6824     vectorized_scalar_stmts.remove (stmt);
6825
6826   /* Calculate scalar cost and sum the cost for the vector stmts
6827      previously collected.  */
6828   stmt_vector_for_cost scalar_costs = vNULL;
6829   stmt_vector_for_cost vector_costs = vNULL;
6830   visited.empty ();
6831   FOR_EACH_VEC_ELT (slp_instances, i, instance)
6832     {
6833       auto_vec<bool, 20> life;
6834       life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
6835                               true);
6836       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6837         record_stmt_cost (&scalar_costs,
6838                           SLP_INSTANCE_ROOT_STMTS (instance).length (),
6839                           scalar_stmt,
6840                           SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
6841       vect_bb_slp_scalar_cost (bb_vinfo,
6842                                SLP_INSTANCE_TREE (instance),
6843                                &life, &scalar_costs, vectorized_scalar_stmts,
6844                                visited);
6845       vector_costs.safe_splice (instance->cost_vec);
6846       instance->cost_vec.release ();
6847     }
6848
6849   if (dump_enabled_p ())
6850     dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
6851
6852   /* When costing non-loop vectorization we need to consider each covered
6853      loop independently and make sure vectorization is profitable.  For
6854      now we assume a loop may be not entered or executed an arbitrary
6855      number of iterations (???  static information can provide more
6856      precise info here) which means we can simply cost each containing
6857      loops stmts separately.  */
6858
6859   /* First produce cost vectors sorted by loop index.  */
6860   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6861     li_scalar_costs (scalar_costs.length ());
6862   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6863     li_vector_costs (vector_costs.length ());
6864   stmt_info_for_cost *cost;
6865   FOR_EACH_VEC_ELT (scalar_costs, i, cost)
6866     {
6867       unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6868       li_scalar_costs.quick_push (std::make_pair (l, cost));
6869     }
6870   /* Use a random used loop as fallback in case the first vector_costs
6871      entry does not have a stmt_info associated with it.  */
6872   unsigned l = li_scalar_costs[0].first;
6873   FOR_EACH_VEC_ELT (vector_costs, i, cost)
6874     {
6875       /* We inherit from the previous COST, invariants, externals and
6876          extracts immediately follow the cost for the related stmt.  */
6877       if (cost->stmt_info)
6878         l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6879       li_vector_costs.quick_push (std::make_pair (l, cost));
6880     }
6881   li_scalar_costs.qsort (li_cost_vec_cmp);
6882   li_vector_costs.qsort (li_cost_vec_cmp);
6883
6884   /* Now cost the portions individually.  */
6885   unsigned vi = 0;
6886   unsigned si = 0;
6887   bool profitable = true;
6888   while (si < li_scalar_costs.length ()
6889          && vi < li_vector_costs.length ())
6890     {
6891       unsigned sl = li_scalar_costs[si].first;
6892       unsigned vl = li_vector_costs[vi].first;
6893       if (sl != vl)
6894         {
6895           if (dump_enabled_p ())
6896             dump_printf_loc (MSG_NOTE, vect_location,
6897                              "Scalar %d and vector %d loop part do not "
6898                              "match up, skipping scalar part\n", sl, vl);
6899           /* Skip the scalar part, assuming zero cost on the vector side.  */
6900           do
6901             {
6902               si++;
6903             }
6904           while (si < li_scalar_costs.length ()
6905                  && li_scalar_costs[si].first == sl);
6906           continue;
6907         }
6908
6909       class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
6910       do
6911         {
6912           add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
6913           si++;
6914         }
6915       while (si < li_scalar_costs.length ()
6916              && li_scalar_costs[si].first == sl);
6917       unsigned dummy;
6918       finish_cost (scalar_target_cost_data, nullptr,
6919                    &dummy, &scalar_cost, &dummy);
6920
6921       /* Complete the target-specific vector cost calculation.  */
6922       class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
6923       do
6924         {
6925           add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
6926           vi++;
6927         }
6928       while (vi < li_vector_costs.length ()
6929              && li_vector_costs[vi].first == vl);
6930       finish_cost (vect_target_cost_data, scalar_target_cost_data,
6931                    &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
6932       delete scalar_target_cost_data;
6933       delete vect_target_cost_data;
6934
6935       vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
6936
6937       if (dump_enabled_p ())
6938         {
6939           dump_printf_loc (MSG_NOTE, vect_location,
6940                            "Cost model analysis for part in loop %d:\n", sl);
6941           dump_printf (MSG_NOTE, "  Vector cost: %d\n",
6942                        vec_inside_cost + vec_outside_cost);
6943           dump_printf (MSG_NOTE, "  Scalar cost: %d\n", scalar_cost);
6944         }
6945
6946       /* Vectorization is profitable if its cost is more than the cost of scalar
6947          version.  Note that we err on the vector side for equal cost because
6948          the cost estimate is otherwise quite pessimistic (constant uses are
6949          free on the scalar side but cost a load on the vector side for
6950          example).  */
6951       if (vec_outside_cost + vec_inside_cost > scalar_cost)
6952         {
6953           profitable = false;
6954           break;
6955         }
6956     }
6957   if (profitable && vi < li_vector_costs.length ())
6958     {
6959       if (dump_enabled_p ())
6960         dump_printf_loc (MSG_NOTE, vect_location,
6961                          "Excess vector cost for part in loop %d:\n",
6962                          li_vector_costs[vi].first);
6963       profitable = false;
6964     }
6965
6966   /* Unset visited flag.  This is delayed when the subgraph is profitable
6967      and we process the loop for remaining unvectorized if-converted code.  */
6968   if (!orig_loop || !profitable)
6969     FOR_EACH_VEC_ELT (scalar_costs, i, cost)
6970       gimple_set_visited  (cost->stmt_info->stmt, false);
6971
6972   scalar_costs.release ();
6973   vector_costs.release ();
6974
6975   return profitable;
6976 }
6977
6978 /* qsort comparator for lane defs.  */
6979
6980 static int
6981 vld_cmp (const void *a_, const void *b_)
6982 {
6983   auto *a = (const std::pair<unsigned, tree> *)a_;
6984   auto *b = (const std::pair<unsigned, tree> *)b_;
6985   return a->first - b->first;
6986 }
6987
6988 /* Return true if USE_STMT is a vector lane insert into VEC and set
6989    *THIS_LANE to the lane number that is set.  */
6990
6991 static bool
6992 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
6993 {
6994   gassign *use_ass = dyn_cast <gassign *> (use_stmt);
6995   if (!use_ass
6996       || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
6997       || (vec
6998           ? gimple_assign_rhs1 (use_ass) != vec
6999           : ((vec = gimple_assign_rhs1 (use_ass)), false))
7000       || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7001                                      TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7002       || !constant_multiple_p
7003             (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7004              tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7005              this_lane))
7006     return false;
7007   return true;
7008 }
7009
/* Find any vectorizable constructors and add them to the grouped_store
   array.

   Every assignment in the blocks of BB_VINFO is inspected and three
   shapes are recognized:
     - a vector CONSTRUCTOR built from scalar SSA names defined in the
       region, which is pushed onto BB_VINFO_GROUPED_STORES,
     - a chain of BIT_INSERT_EXPRs that sets every lane of a vector
       exactly once, which is recorded as slp_inst_kind_ctor root,
     - the end of an association chain, which is recorded as
       slp_inst_kind_bb_reduc root.
   The latter two are pushed onto BB_VINFO->roots.  */

static void
vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
{
  for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
         !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
      if (!assign)
        continue;

      tree rhs = gimple_assign_rhs1 (assign);
      enum tree_code code = gimple_assign_rhs_code (assign);
      /* Only used by the association chain end check further below.  */
      use_operand_p use_p;
      gimple *use_stmt;
      if (code == CONSTRUCTOR)
        {
          /* Only consider vector CTORs that specify every element, are
             built from non-vector elements and are not uniform.  */
          if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
              || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
                           CONSTRUCTOR_NELTS (rhs))
              || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
              || uniform_vector_p (rhs))
            continue;

          /* All elements need to be SSA names with a def known to
             BB_VINFO; J stops short of the element count otherwise.  */
          unsigned j;
          tree val;
          FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
              if (TREE_CODE (val) != SSA_NAME
                  || !bb_vinfo->lookup_def (val))
                break;
          if (j != CONSTRUCTOR_NELTS (rhs))
            continue;

          stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
          BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
        }
      else if (code == BIT_INSERT_EXPR
               && VECTOR_TYPE_P (TREE_TYPE (rhs))
               && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
               && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
               && integer_zerop (gimple_assign_rhs3 (assign))
               && useless_type_conversion_p
                    (TREE_TYPE (TREE_TYPE (rhs)),
                     TREE_TYPE (gimple_assign_rhs2 (assign)))
               && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
        {
          /* We start to match on insert to lane zero but since the
             inserts need not be ordered we'd have to search both
             the def and the use chains.  */
          tree vectype = TREE_TYPE (rhs);
          unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
          /* LANE_DEFS collects (lane, inserted value) pairs, LANES tracks
             which lanes have been seen so duplicates end the search.  */
          auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
          auto_sbitmap lanes (nlanes);
          bitmap_clear (lanes);
          bitmap_set_bit (lanes, 0);
          tree def = gimple_assign_lhs (assign);
          lane_defs.quick_push
                      (std::make_pair (0, gimple_assign_rhs2 (assign)));
          unsigned lanes_found = 1;
          /* Start with the use chains, the last stmt will be the root.  */
          stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
          vec<stmt_vec_info> roots = vNULL;
          roots.safe_push (last);
          do
            {
              /* Follow DEF to its only use as long as that is another
                 lane insert into DEF of a value defined in the region.  */
              use_operand_p use_p;
              gimple *use_stmt;
              if (!single_imm_use (def, &use_p, &use_stmt))
                break;
              unsigned this_lane;
              if (!bb_vinfo->lookup_stmt (use_stmt)
                  || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
                  || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
                break;
              if (bitmap_bit_p (lanes, this_lane))
                break;
              lanes_found++;
              bitmap_set_bit (lanes, this_lane);
              gassign *use_ass = as_a <gassign *> (use_stmt);
              lane_defs.quick_push (std::make_pair
                                     (this_lane, gimple_assign_rhs2 (use_ass)));
              last = bb_vinfo->lookup_stmt (use_ass);
              roots.safe_push (last);
              def = gimple_assign_lhs (use_ass);
            }
          while (lanes_found < nlanes);
          /* Move the last insert of the use chain to the front of ROOTS.  */
          if (roots.length () > 1)
            std::swap(roots[0], roots[roots.length () - 1]);
          if (lanes_found < nlanes)
            {
              /* Now search the def chain.  */
              def = gimple_assign_rhs1 (assign);
              do
                {
                  /* Walk from the vector inserted into to the insert
                     defining it, requiring each intermediate vector to
                     have a single use.  */
                  if (TREE_CODE (def) != SSA_NAME
                      || !has_single_use (def))
                    break;
                  gimple *def_stmt = SSA_NAME_DEF_STMT (def);
                  unsigned this_lane;
                  if (!bb_vinfo->lookup_stmt (def_stmt)
                      || !vect_slp_is_lane_insert (def_stmt,
                                                   NULL_TREE, &this_lane)
                      || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
                    break;
                  if (bitmap_bit_p (lanes, this_lane))
                    break;
                  lanes_found++;
                  bitmap_set_bit (lanes, this_lane);
                  lane_defs.quick_push (std::make_pair
                                          (this_lane,
                                           gimple_assign_rhs2 (def_stmt)));
                  roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
                  def = gimple_assign_rhs1 (def_stmt);
                }
              while (lanes_found < nlanes);
            }
          if (lanes_found == nlanes)
            {
              /* Sort lane_defs after the lane index and register the root.  */
              lane_defs.qsort (vld_cmp);
              vec<stmt_vec_info> stmts;
              stmts.create (nlanes);
              for (unsigned i = 0; i < nlanes; ++i)
                stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
              bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
                                                   stmts, roots));
            }
          else
            /* Not every lane is set by the chain, give up on it.  */
            roots.release ();
        }
      else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
               && (associative_tree_code (code) || code == MINUS_EXPR)
               /* ???  The flag_associative_math and TYPE_OVERFLOW_WRAPS
                  checks pessimize a two-element reduction.  PR54400.
                  ???  In-order reduction could be handled if we only
                  traverse one operand chain in vect_slp_linearize_chain.  */
               && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
                   || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
                       && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
               /* Ops with constants at the tail can be stripped here.  */
               && TREE_CODE (rhs) == SSA_NAME
               && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
               /* Should be the chain end.  That is, the result does not
                  have a single use in an assignment with the same code
                  or, for PLUS_EXPR and MINUS_EXPR, the respective other
                  code.  */
               && (!single_imm_use (gimple_assign_lhs (assign),
                                    &use_p, &use_stmt)
                   || !is_gimple_assign (use_stmt)
                   || (gimple_assign_rhs_code (use_stmt) != code
                       && ((code != PLUS_EXPR && code != MINUS_EXPR)
                           || (gimple_assign_rhs_code (use_stmt)
                               != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
        {
          /* We start the match at the end of a possible association
             chain.  */
          auto_vec<chain_op_t> chain;
          auto_vec<std::pair<tree_code, gimple *> > worklist;
          auto_vec<gimple *> chain_stmts;
          gimple *code_stmt = NULL, *alt_code_stmt = NULL;
          /* A chain ending in a subtraction is handled as PLUS_EXPR
             chain.  */
          if (code == MINUS_EXPR)
            code = PLUS_EXPR;
          /* Without a reduction function for CODE there is nothing to
             do.  */
          internal_fn reduc_fn;
          if (!reduction_fn_for_scalar_code (code, &reduc_fn)
              || reduc_fn == IFN_LAST)
            continue;
          vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
                                    /* ??? */
                                    code_stmt, alt_code_stmt, &chain_stmts);
          if (chain.length () > 1)
            {
              /* Sort the chain according to def_type and operation.  */
              chain.sort (dt_sort_cmp, bb_vinfo);
              /* ???  Now we'd want to strip externals and constants
                 but record those to be handled in the epilogue.  */
              /* ???  For now do not allow mixing ops or externs/constants.  */
              bool invalid = false;
              for (unsigned i = 0; i < chain.length (); ++i)
                if (chain[i].dt != vect_internal_def
                    || chain[i].code != code)
                  invalid = true;
              if (!invalid)
                {
                  /* The chain operands become the lanes and the stmts of
                     the chain the roots of the instance.  */
                  vec<stmt_vec_info> stmts;
                  stmts.create (chain.length ());
                  for (unsigned i = 0; i < chain.length (); ++i)
                    stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
                  vec<stmt_vec_info> roots;
                  roots.create (chain_stmts.length ());
                  for (unsigned i = 0; i < chain_stmts.length (); ++i)
                    roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
                  bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
                                                       stmts, roots));
                }
            }
        }
    }
}
7208
7209 /* Walk the grouped store chains and replace entries with their
7210    pattern variant if any.  */
7211
7212 static void
7213 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7214 {
7215   stmt_vec_info first_element;
7216   unsigned i;
7217
7218   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7219     {
7220       /* We also have CTORs in this array.  */
7221       if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7222         continue;
7223       if (STMT_VINFO_IN_PATTERN_P (first_element))
7224         {
7225           stmt_vec_info orig = first_element;
7226           first_element = STMT_VINFO_RELATED_STMT (first_element);
7227           DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7228           DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7229           DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7230           DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7231           vinfo->grouped_stores[i] = first_element;
7232         }
7233       stmt_vec_info prev = first_element;
7234       while (DR_GROUP_NEXT_ELEMENT (prev))
7235         {
7236           stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7237           if (STMT_VINFO_IN_PATTERN_P (elt))
7238             {
7239               stmt_vec_info orig = elt;
7240               elt = STMT_VINFO_RELATED_STMT (elt);
7241               DR_GROUP_NEXT_ELEMENT (prev) = elt;
7242               DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7243               DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7244             }
7245           DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7246           prev = elt;
7247         }
7248     }
7249 }
7250
7251 /* Check if the region described by BB_VINFO can be vectorized, returning
7252    true if so.  When returning false, set FATAL to true if the same failure
7253    would prevent vectorization at other vector sizes, false if it is still
7254    worth trying other sizes.  N_STMTS is the number of statements in the
7255    region.  */
7256
7257 static bool
7258 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7259                        vec<int> *dataref_groups)
7260 {
7261   DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7262
7263   slp_instance instance;
7264   int i;
7265   poly_uint64 min_vf = 2;
7266
7267   /* The first group of checks is independent of the vector size.  */
7268   fatal = true;
7269
7270   /* Analyze the data references.  */
7271
7272   if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7273     {
7274       if (dump_enabled_p ())
7275         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7276                          "not vectorized: unhandled data-ref in basic "
7277                          "block.\n");
7278       return false;
7279     }
7280
7281   if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7282     {
7283      if (dump_enabled_p ())
7284        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7285                         "not vectorized: unhandled data access in "
7286                         "basic block.\n");
7287       return false;
7288     }
7289
7290   vect_slp_check_for_constructors (bb_vinfo);
7291
7292   /* If there are no grouped stores and no constructors in the region
7293      there is no need to continue with pattern recog as vect_analyze_slp
7294      will fail anyway.  */
7295   if (bb_vinfo->grouped_stores.is_empty ()
7296       && bb_vinfo->roots.is_empty ())
7297     {
7298       if (dump_enabled_p ())
7299         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7300                          "not vectorized: no grouped stores in "
7301                          "basic block.\n");
7302       return false;
7303     }
7304
7305   /* While the rest of the analysis below depends on it in some way.  */
7306   fatal = false;
7307
7308   vect_pattern_recog (bb_vinfo);
7309
7310   /* Update store groups from pattern processing.  */
7311   vect_fixup_store_groups_with_patterns (bb_vinfo);
7312
7313   /* Check the SLP opportunities in the basic block, analyze and build SLP
7314      trees.  */
7315   if (!vect_analyze_slp (bb_vinfo, n_stmts))
7316     {
7317       if (dump_enabled_p ())
7318         {
7319           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7320                            "Failed to SLP the basic block.\n");
7321           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7322                            "not vectorized: failed to find SLP opportunities "
7323                            "in basic block.\n");
7324         }
7325       return false;
7326     }
7327
7328   /* Optimize permutations.  */
7329   vect_optimize_slp (bb_vinfo);
7330
7331   /* Gather the loads reachable from the SLP graph entries.  */
7332   vect_gather_slp_loads (bb_vinfo);
7333
7334   vect_record_base_alignments (bb_vinfo);
7335
7336   /* Analyze and verify the alignment of data references and the
7337      dependence in the SLP instances.  */
7338   for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7339     {
7340       vect_location = instance->location ();
7341       if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7342           || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7343         {
7344           slp_tree node = SLP_INSTANCE_TREE (instance);
7345           stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7346           if (dump_enabled_p ())
7347             dump_printf_loc (MSG_NOTE, vect_location,
7348                              "removing SLP instance operations starting from: %G",
7349                              stmt_info->stmt);
7350           vect_free_slp_instance (instance);
7351           BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7352           continue;
7353         }
7354
7355       /* Mark all the statements that we want to vectorize as pure SLP and
7356          relevant.  */
7357       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7358       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7359       unsigned j;
7360       stmt_vec_info root;
7361       /* Likewise consider instance root stmts as vectorized.  */
7362       FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7363         STMT_SLP_TYPE (root) = pure_slp;
7364
7365       i++;
7366     }
7367   if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7368     return false;
7369
7370   if (!vect_slp_analyze_operations (bb_vinfo))
7371     {
7372       if (dump_enabled_p ())
7373         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7374                          "not vectorized: bad operation in basic block.\n");
7375       return false;
7376     }
7377
7378   vect_bb_partition_graph (bb_vinfo);
7379
7380   return true;
7381 }
7382
7383 /* Subroutine of vect_slp_bb.  Try to vectorize the statements for all
7384    basic blocks in BBS, returning true on success.
7385    The region has N_STMTS statements and has the datarefs given by DATAREFS.  */
7386
7387 static bool
7388 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7389                  vec<int> *dataref_groups, unsigned int n_stmts,
7390                  loop_p orig_loop)
7391 {
7392   bb_vec_info bb_vinfo;
7393   auto_vector_modes vector_modes;
7394
7395   /* Autodetect first vector size we try.  */
7396   machine_mode next_vector_mode = VOIDmode;
7397   targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7398   unsigned int mode_i = 0;
7399
7400   vec_info_shared shared;
7401
7402   machine_mode autodetected_vector_mode = VOIDmode;
7403   while (1)
7404     {
7405       bool vectorized = false;
7406       bool fatal = false;
7407       bb_vinfo = new _bb_vec_info (bbs, &shared);
7408
7409       bool first_time_p = shared.datarefs.is_empty ();
7410       BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7411       if (first_time_p)
7412         bb_vinfo->shared->save_datarefs ();
7413       else
7414         bb_vinfo->shared->check_datarefs ();
7415       bb_vinfo->vector_mode = next_vector_mode;
7416
7417       if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7418         {
7419           if (dump_enabled_p ())
7420             {
7421               dump_printf_loc (MSG_NOTE, vect_location,
7422                                "***** Analysis succeeded with vector mode"
7423                                " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7424               dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7425             }
7426
7427           bb_vinfo->shared->check_datarefs ();
7428
7429           auto_vec<slp_instance> profitable_subgraphs;
7430           for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7431             {
7432               if (instance->subgraph_entries.is_empty ())
7433                 continue;
7434
7435               vect_location = instance->location ();
7436               if (!unlimited_cost_model (NULL)
7437                   && !vect_bb_vectorization_profitable_p
7438                         (bb_vinfo, instance->subgraph_entries, orig_loop))
7439                 {
7440                   if (dump_enabled_p ())
7441                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7442                                      "not vectorized: vectorization is not "
7443                                      "profitable.\n");
7444                   continue;
7445                 }
7446
7447               if (!dbg_cnt (vect_slp))
7448                 continue;
7449
7450               profitable_subgraphs.safe_push (instance);
7451             }
7452
7453           /* When we're vectorizing an if-converted loop body make sure
7454              we vectorized all if-converted code.  */
7455           if (!profitable_subgraphs.is_empty ()
7456               && orig_loop)
7457             {
7458               gcc_assert (bb_vinfo->bbs.length () == 1);
7459               for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7460                    !gsi_end_p (gsi); gsi_next (&gsi))
7461                 {
7462                   /* The costing above left us with DCEable vectorized scalar
7463                      stmts having the visited flag set on profitable
7464                      subgraphs.  Do the delayed clearing of the flag here.  */
7465                   if (gimple_visited_p (gsi_stmt (gsi)))
7466                     {
7467                       gimple_set_visited (gsi_stmt (gsi), false);
7468                       continue;
7469                     }
7470                   if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7471                     continue;
7472
7473                   if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7474                     if (gimple_assign_rhs_code (ass) == COND_EXPR)
7475                       {
7476                         if (!profitable_subgraphs.is_empty ()
7477                             && dump_enabled_p ())
7478                           dump_printf_loc (MSG_NOTE, vect_location,
7479                                            "not profitable because of "
7480                                            "unprofitable if-converted scalar "
7481                                            "code\n");
7482                         profitable_subgraphs.truncate (0);
7483                       }
7484                 }
7485             }
7486
7487           /* Finally schedule the profitable subgraphs.  */
7488           for (slp_instance instance : profitable_subgraphs)
7489             {
7490               if (!vectorized && dump_enabled_p ())
7491                 dump_printf_loc (MSG_NOTE, vect_location,
7492                                  "Basic block will be vectorized "
7493                                  "using SLP\n");
7494               vectorized = true;
7495
7496               vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7497
7498               unsigned HOST_WIDE_INT bytes;
7499               if (dump_enabled_p ())
7500                 {
7501                   if (GET_MODE_SIZE
7502                         (bb_vinfo->vector_mode).is_constant (&bytes))
7503                     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
7504                                      "basic block part vectorized using %wu "
7505                                      "byte vectors\n", bytes);
7506                   else
7507                     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
7508                                      "basic block part vectorized using "
7509                                      "variable length vectors\n");
7510                 }
7511             }
7512         }
7513       else
7514         {
7515           if (dump_enabled_p ())
7516             dump_printf_loc (MSG_NOTE, vect_location,
7517                              "***** Analysis failed with vector mode %s\n",
7518                              GET_MODE_NAME (bb_vinfo->vector_mode));
7519         }
7520
7521       if (mode_i == 0)
7522         autodetected_vector_mode = bb_vinfo->vector_mode;
7523
7524       if (!fatal)
7525         while (mode_i < vector_modes.length ()
7526                && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7527           {
7528             if (dump_enabled_p ())
7529               dump_printf_loc (MSG_NOTE, vect_location,
7530                                "***** The result for vector mode %s would"
7531                                " be the same\n",
7532                                GET_MODE_NAME (vector_modes[mode_i]));
7533             mode_i += 1;
7534           }
7535
7536       delete bb_vinfo;
7537
7538       if (mode_i < vector_modes.length ()
7539           && VECTOR_MODE_P (autodetected_vector_mode)
7540           && (related_vector_mode (vector_modes[mode_i],
7541                                    GET_MODE_INNER (autodetected_vector_mode))
7542               == autodetected_vector_mode)
7543           && (related_vector_mode (autodetected_vector_mode,
7544                                    GET_MODE_INNER (vector_modes[mode_i]))
7545               == vector_modes[mode_i]))
7546         {
7547           if (dump_enabled_p ())
7548             dump_printf_loc (MSG_NOTE, vect_location,
7549                              "***** Skipping vector mode %s, which would"
7550                              " repeat the analysis for %s\n",
7551                              GET_MODE_NAME (vector_modes[mode_i]),
7552                              GET_MODE_NAME (autodetected_vector_mode));
7553           mode_i += 1;
7554         }
7555
7556       if (vectorized
7557           || mode_i == vector_modes.length ()
7558           || autodetected_vector_mode == VOIDmode
7559           /* If vect_slp_analyze_bb_1 signaled that analysis for all
7560              vector sizes will fail do not bother iterating.  */
7561           || fatal)
7562         return vectorized;
7563
7564       /* Try the next biggest vector size.  */
7565       next_vector_mode = vector_modes[mode_i++];
7566       if (dump_enabled_p ())
7567         dump_printf_loc (MSG_NOTE, vect_location,
7568                          "***** Re-trying analysis with vector mode %s\n",
7569                          GET_MODE_NAME (next_vector_mode));
7570     }
7571 }
7572
7573
7574 /* Shared driver of the BB vectorizer entries.  Collect the data references
7575    in BBS and run vect_slp_region; returns true if anything was vectorized.  */
7576
7577 static bool
7578 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7579 {
7580   int n_stmts = 0;
7581   int group_id = 0;
7582   auto_vec<int> dataref_groups;
7583   vec<data_reference_p> datarefs = vNULL;
7584
7585   for (basic_block bb : bbs)
7586     {
7587       gimple_stmt_iterator gsi = gsi_after_labels (bb);
7588       for (; !gsi_end_p (gsi); gsi_next (&gsi))
7589         {
7590           gimple *stmt = gsi_stmt (gsi);
7591           if (!is_gimple_debug (stmt))
7592             {
7593               /* Only non-debug stmts are counted and scanned.  */
7594               n_stmts += 1;
7595
7596               if (gimple_location (stmt) != UNKNOWN_LOCATION)
7597                 vect_location = stmt;
7598
7599               /* A false result makes later stmts use the next group.  */
7600               if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
7601                                                   &dataref_groups, group_id))
7602                 group_id += 1;
7603             }
7604         }
7605       /* Stmts of the next block never share a group with this one.  */
7606       group_id += 1;
7607     }
7608   return vect_slp_region (bbs, datarefs, &dataref_groups, n_stmts, orig_loop);
7609 }
7610
7611 /* Special entry for the BB vectorizer.  Analyze and transform the single
7612    if-converted block BB, where ORIG_LOOP's body is the not if-converted
7613    representation.  BB is passed to vect_slp_bbs as a one-block region.
7614    Returns true if anything in the basic-block was vectorized.  */
7615
7616 bool
7617 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
7618 {
7619   auto_vec<basic_block> bbs;
7620   bbs.safe_push (bb);
7621   return vect_slp_bbs (bbs, orig_loop);
7622 }
7623
7624 /* Main entry for the BB vectorizer.  Chop the blocks of FUN into regions,
7625    run vect_slp_bbs on each and return true if any of them was vectorized.  */
7626
7627 bool
7628 vect_slp_function (function *fun)
7629 {
7630   bool r = false;
7631   int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
7632   unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
7633
7634   /* For the moment split the function into pieces to avoid making
7635      the iteration on the vector mode moot.  Split at points we know
7636      to not handle well which is CFG merges (SLP discovery doesn't
7637      handle non-loop-header PHIs) and loop exits.  Since pattern
7638      recog requires reverse iteration to visit uses before defs
7639      simply chop RPO into pieces.  */
7640   auto_vec<basic_block> bbs;
7641   for (unsigned i = 0; i < n; i++)
7642     {
7643       basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
7644       bool split = false;
7645
7646       /* Split when a BB is not dominated by the first block.  */
7647       if (!bbs.is_empty ()
7648           && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
7649         {
7650           if (dump_enabled_p ())
7651             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7652                              "splitting region at dominance boundary bb%d\n",
7653                              bb->index);
7654           split = true;
7655         }
7656       /* Split when the loop determined by the first block
7657          is exited.  This is because we eventually insert
7658          invariants at region begin.  */
7659       else if (!bbs.is_empty ()
7660                && bbs[0]->loop_father != bb->loop_father
7661                && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
7662         {
7663           if (dump_enabled_p ())
7664             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7665                              "splitting region at loop %d exit at bb%d\n",
7666                              bbs[0]->loop_father->num, bb->index);
7667           split = true;
7668         }
7669       /* Vectorize the region collected so far and restart it at BB.  */
7670       if (split && !bbs.is_empty ())
7671         {
7672           r |= vect_slp_bbs (bbs, NULL);
7673           bbs.truncate (0);
7674           bbs.quick_push (bb);
7675         }
7676       else
7677         bbs.safe_push (bb);
7678
7679       /* When we have a stmt ending this block and defining a
7680          value we have to insert on edges when inserting after it for
7681          a vector containing its definition.  Avoid this for now.  */
7682       if (gimple *last = last_stmt (bb))
7683         if (gimple_get_lhs (last)
7684             && is_ctrl_altering_stmt (last))
7685           {
7686             if (dump_enabled_p ())
7687               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7688                                "splitting region at control altering "
7689                                "definition %G", last);
7690             r |= vect_slp_bbs (bbs, NULL);
7691             bbs.truncate (0);
7692           }
7693     }
7694   /* Handle the blocks left over after the last split.  */
7695   if (!bbs.is_empty ())
7696     r |= vect_slp_bbs (bbs, NULL);
7697
7698   free (rpo);
7699
7700   return r;
7701 }
7702
7703 /* Build a variable-length vector in which the elements in ELTS are repeated
7704    to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
7705    RESULTS and add any new instructions to SEQ.
7706
7707    The approach we use is:
7708
7709    (1) Find a vector mode VM with integer elements of mode IM.
7710
7711    (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7712        ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
7713        from small vectors to IM.
7714
7715    (3) Duplicate each ELTS'[I] into a vector of mode VM.
7716
7717    (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
7718        correct byte contents.
7719
7720    (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
7721
7722    We try to find the largest IM for which this sequence works, in order
7723    to cut down on the number of interleaves.  */
7724
7725 void
7726 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
7727                           const vec<tree> &elts, unsigned int nresults,
7728                           vec<tree> &results)
7729 {
7730   unsigned int nelts = elts.length ();
7731   tree element_type = TREE_TYPE (vector_type);
7732   /* (1) Find a vector mode VM with integer elements of mode IM.  A failing
7733      query is treated as unreachable here.  */
7734   unsigned int nvectors = 1;
7735   tree new_vector_type;
7736   tree permutes[2];
7737   if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
7738                                        &nvectors, &new_vector_type,
7739                                        permutes))
7740     gcc_unreachable ();
7741
7742   /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
7743   unsigned int partial_nelts = nelts / nvectors;
7744   tree partial_vector_type = build_vector_type (element_type, partial_nelts);
7745   /* PIECES holds two banks of NVECTORS vectors, swapped by step (4).  */
7746   tree_vector_builder partial_elts;
7747   auto_vec<tree, 32> pieces (nvectors * 2);
7748   pieces.quick_grow_cleared (nvectors * 2);
7749   for (unsigned int i = 0; i < nvectors; ++i)
7750     {
7751       /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7752              ELTS' has mode IM.  */
7753       partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
7754       for (unsigned int j = 0; j < partial_nelts; ++j)
7755         partial_elts.quick_push (elts[i * partial_nelts + j]);
7756       tree t = gimple_build_vector (seq, &partial_elts);
7757       t = gimple_build (seq, VIEW_CONVERT_EXPR,
7758                         TREE_TYPE (new_vector_type), t);
7759
7760       /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
7761       pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
7762     }
7763
7764   /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
7765          correct byte contents.
7766
7767      Conceptually, we need to repeat the following operation log2(nvectors)
7768      times, where hi_start = nvectors / 2:
7769
7770         out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
7771         out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
7772
7773      However, if each input repeats every N elements and the VF is
7774      a multiple of N * 2, the HI result is the same as the LO result.
7775      This will be true for the first N1 iterations of the outer loop,
7776      followed by N2 iterations for which both the LO and HI results
7777      are needed.  I.e.:
7778
7779         N1 + N2 = log2(nvectors)
7780
7781      Each "N1 iteration" doubles the number of redundant vectors and the
7782      effect of the process as a whole is to have a sequence of nvectors/2**N1
7783      vectors that repeats 2**N1 times.  Rather than generate these redundant
7784      vectors, we halve the number of vectors for each N1 iteration.  */
7785   unsigned int in_start = 0;
7786   unsigned int out_start = nvectors;
7787   unsigned int new_nvectors = nvectors;
7788   for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
7789     {
7790       unsigned int hi_start = new_nvectors / 2;
7791       unsigned int out_i = 0;
7792       for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
7793         {
7794           if ((in_i & 1) != 0
7795               && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
7796                              2 * in_repeat))
7797             continue;
7798           /* Even IN_I uses PERMUTES[0], odd IN_I uses PERMUTES[1].  */
7799           tree output = make_ssa_name (new_vector_type);
7800           tree input1 = pieces[in_start + (in_i / 2)];
7801           tree input2 = pieces[in_start + (in_i / 2) + hi_start];
7802           gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
7803                                                input1, input2,
7804                                                permutes[in_i & 1]);
7805           gimple_seq_add_stmt (seq, stmt);
7806           pieces[out_start + out_i] = output;
7807           out_i += 1;
7808         }
7809       std::swap (in_start, out_start);
7810       new_nvectors = out_i;
7811     }
7812   /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.
7813      Results past NEW_NVECTORS reuse the ones already pushed.  */
7814   results.reserve (nresults);
7815   for (unsigned int i = 0; i < nresults; ++i)
7816     if (i < new_nvectors)
7817       results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
7818                                         pieces[in_start + i]));
7819     else
7820       results.quick_push (results[i - new_nvectors]);
7821 }
7822
7823
7824 /* For constant and loop invariant defs in OP_NODE this function creates
7825    vector defs that will be used in the vectorized stmts and stores them
7826    to SLP_TREE_VEC_DEFS of OP_NODE.  */
7827
7828 static void
7829 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
7830 {
7831   unsigned HOST_WIDE_INT nunits;
7832   tree vec_cst;
7833   unsigned j, number_of_places_left_in_vector;
7834   tree vector_type;
7835   tree vop;
7836   int group_size = op_node->ops.length ();
7837   unsigned int vec_num, i;
7838   unsigned number_of_copies = 1;
7839   bool constant_p;
7840   gimple_seq ctor_seq = NULL;
7841   auto_vec<tree, 16> permute_results;
7842
7843   /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
7844   vector_type = SLP_TREE_VECTYPE (op_node);
7845
7846   unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
7847   SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
7848   auto_vec<tree> voprnds (number_of_vectors);
7849
7850   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
7851      created vectors. It is greater than 1 if unrolling is performed.
7852
7853      For example, we have two scalar operands, s1 and s2 (e.g., group of
7854      strided accesses of size two), while NUNITS is four (i.e., four scalars
7855      of this type can be packed in a vector).  The output vector will contain
7856      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
7857      will be 2).
7858
7859      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
7860      containing the operands.
7861
7862      For example, NUNITS is four as before, and the group size is 8
7863      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
7864      {s5, s6, s7, s8}.  */
7865
7866   /* When using duplicate_and_interleave, we just need one element for
7867      each scalar statement.  */
7868   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
7869     nunits = group_size;
7870
7871   number_of_copies = nunits * number_of_vectors / group_size;
7872   /* CONSTANT_P and INSERT_AFTER are tracked per vector being built.  */
7873   number_of_places_left_in_vector = nunits;
7874   constant_p = true;
7875   tree_vector_builder elts (vector_type, nunits, 1);
7876   elts.quick_grow (nunits);
7877   stmt_vec_info insert_after = NULL;
7878   for (j = 0; j < number_of_copies; j++)
7879     {
7880       tree op;
7881       for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
7882         {
7883           /* Create 'vect_ = {op0,op1,...,opn}', filled from the last element.  */
7884           number_of_places_left_in_vector--;
7885           tree orig_op = op;
7886           if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
7887             {
7888               if (CONSTANT_CLASS_P (op))
7889                 {
7890                   if (VECTOR_BOOLEAN_TYPE_P (vector_type))
7891                     {
7892                       /* Can't use VIEW_CONVERT_EXPR for booleans because
7893                          of possibly different sizes of scalar value and
7894                          vector element.  */
7895                       if (integer_zerop (op))
7896                         op = build_int_cst (TREE_TYPE (vector_type), 0);
7897                       else if (integer_onep (op))
7898                         op = build_all_ones_cst (TREE_TYPE (vector_type));
7899                       else
7900                         gcc_unreachable ();
7901                     }
7902                   else
7903                     op = fold_unary (VIEW_CONVERT_EXPR,
7904                                      TREE_TYPE (vector_type), op);
7905                   gcc_assert (op && CONSTANT_CLASS_P (op));
7906                 }
7907               else
7908                 {
7909                   tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
7910                   gimple *init_stmt;
7911                   if (VECTOR_BOOLEAN_TYPE_P (vector_type))
7912                     {
7913                       tree true_val
7914                         = build_all_ones_cst (TREE_TYPE (vector_type));
7915                       tree false_val
7916                         = build_zero_cst (TREE_TYPE (vector_type));
7917                       gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
7918                       init_stmt = gimple_build_assign (new_temp, COND_EXPR,
7919                                                        op, true_val,
7920                                                        false_val);
7921                     }
7922                   else
7923                     {
7924                       op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
7925                                    op);
7926                       init_stmt
7927                         = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
7928                                                op);
7929                     }
7930                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
7931                   op = new_temp;
7932                 }
7933             }
7934           elts[number_of_places_left_in_vector] = op;
7935           if (!CONSTANT_CLASS_P (op))
7936             constant_p = false;
7937           /* For BB vectorization we have to compute an insert location
7938              when a def is inside the analyzed region since we cannot
7939              simply insert at the BB start in this case.  */
7940           stmt_vec_info opdef;
7941           if (TREE_CODE (orig_op) == SSA_NAME
7942               && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
7943               && is_a <bb_vec_info> (vinfo)
7944               && (opdef = vinfo->lookup_def (orig_op)))
7945             {
7946               if (!insert_after)
7947                 insert_after = opdef;
7948               else
7949                 insert_after = get_later_stmt (insert_after, opdef);
7950             }
7951           /* The vector is full: build it and emit any pending stmts.  */
7952           if (number_of_places_left_in_vector == 0)
7953             {
7954               if (constant_p
7955                   ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
7956                   : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
7957                 vec_cst = gimple_build_vector (&ctor_seq, &elts);
7958               else
7959                 {
7960                   if (permute_results.is_empty ())
7961                     duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
7962                                               elts, number_of_vectors,
7963                                               permute_results);
7964                   vec_cst = permute_results[number_of_vectors - j - 1];
7965                 }
7966               if (!gimple_seq_empty_p (ctor_seq))
7967                 {
7968                   if (insert_after)
7969                     {
7970                       gimple_stmt_iterator gsi;
7971                       if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
7972                         {
7973                           gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
7974                           gsi_insert_seq_before (&gsi, ctor_seq,
7975                                                  GSI_CONTINUE_LINKING);
7976                         }
7977                       else if (!stmt_ends_bb_p (insert_after->stmt))
7978                         {
7979                           gsi = gsi_for_stmt (insert_after->stmt);
7980                           gsi_insert_seq_after (&gsi, ctor_seq,
7981                                                 GSI_CONTINUE_LINKING);
7982                         }
7983                       else
7984                         {
7985                           /* When we want to insert after a def where the
7986                              defining stmt throws then insert on the fallthru
7987                              edge.  */
7988                           edge e = find_fallthru_edge
7989                                      (gimple_bb (insert_after->stmt)->succs);
7990                           basic_block new_bb
7991                             = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
7992                           gcc_assert (!new_bb);
7993                         }
7994                     }
7995                   else
7996                     vinfo->insert_seq_on_entry (NULL, ctor_seq);
7997                   ctor_seq = NULL;
7998                 }
7999               voprnds.quick_push (vec_cst);
8000               insert_after = NULL;
8001               number_of_places_left_in_vector = nunits;
8002               constant_p = true;
8003               elts.new_vector (vector_type, nunits, 1);
8004               elts.quick_grow (nunits);
8005             }
8006         }
8007     }
8008
8009   /* Since the vectors are created in the reverse order, we should invert
8010      them.  */
8011   vec_num = voprnds.length ();
8012   for (j = vec_num; j != 0; j--)
8013     {
8014       vop = voprnds[j - 1];
8015       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8016     }
8017
8018   /* In case that VF is greater than the unrolling factor needed for the SLP
8019      group of stmts, NUMBER_OF_VECTORS to be created is greater than
8020      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8021      to replicate the vectors.  */
8022   while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8023     for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8024          i++)
8025       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8026 }
8027
8028 /* Return vectorized definition number I of SLP_NODE.  */
8029
8030 tree
8031 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8032 {
8033   /* Without vectorized stmts the defs are recorded directly.  */
8034   if (!SLP_TREE_VEC_STMTS (slp_node).exists ())
8035     return SLP_TREE_VEC_DEFS (slp_node)[i];
8036   return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
8037 }
8038
8039 /* Create *VEC_DEFS and fill it with the vectorized defs of SLP_NODE.  */
8040
8041 void
8042 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8043 {
8044   vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8045   if (SLP_TREE_DEF_TYPE (slp_node) != vect_internal_def)
8046     {
8047       /* Non-internal defs are copied from SLP_TREE_VEC_DEFS as is.  */
8048       vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8049       return;
8050     }
8051   /* Internal defs are the LHSs of the vectorized stmts.  */
8052   for (gimple *vec_stmt : SLP_TREE_VEC_STMTS (slp_node))
8053     vec_defs->quick_push (gimple_get_lhs (vec_stmt));
8054 }
8055
8056 /* Push the vectorized defs of the first N children of SLP_NODE onto
8057    *VEC_OPRNDS, one vector per child.  N == -1U selects all children.  */
8058
8059 void
8060 vect_get_slp_defs (vec_info *,
8061                    slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8062 {
8063   const unsigned count = (n == -1U
8064                           ? SLP_TREE_CHILDREN (slp_node).length () : n);
8065   for (unsigned idx = 0; idx < count; ++idx)
8066     {
8067       /* The callee creates and fills CHILD_DEFS.  */
8068       vec<tree> child_defs = vNULL;
8069       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[idx], &child_defs);
8070       vec_oprnds->quick_push (child_defs);
8071     }
8072 }
8073
8074 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8075    - PERM gives the permutation that the caller wants to use for NODE,
8076      which might be different from SLP_LOAD_PERMUTATION.
8077    - DUMP_P controls whether the function dumps information.  */
8078
8079 static bool
8080 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8081                                 load_permutation_t &perm,
8082                                 const vec<tree> &dr_chain,
8083                                 gimple_stmt_iterator *gsi, poly_uint64 vf,
8084                                 bool analyze_only, bool dump_p,
8085                                 unsigned *n_perms, unsigned int *n_loads,
8086                                 bool dce_chain)
8087 {
8088   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8089   int vec_index = 0;
8090   tree vectype = SLP_TREE_VECTYPE (node);
8091   unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8092   unsigned int mask_element;
8093   machine_mode mode;
8094
8095   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8096     return false;
8097
8098   stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8099
8100   mode = TYPE_MODE (vectype);
8101   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8102
8103   /* Initialize the vect stmts of NODE to properly insert the generated
8104      stmts later.  */
8105   if (! analyze_only)
8106     for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
8107          i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
8108       SLP_TREE_VEC_STMTS (node).quick_push (NULL);
8109
8110   /* Generate permutation masks for every NODE. Number of masks for each NODE
8111      is equal to GROUP_SIZE.
8112      E.g., we have a group of three nodes with three loads from the same
8113      location in each node, and the vector size is 4. I.e., we have a
8114      a0b0c0a1b1c1... sequence and we need to create the following vectors:
8115      for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8116      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8117      ...
8118
8119      The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8120      The last mask is illegal since we assume two operands for permute
8121      operation, and the mask element values can't be outside that range.
8122      Hence, the last mask must be converted into {2,5,5,5}.
8123      For the first two permutations we need the first and the second input
8124      vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8125      we need the second and the third vectors: {b1,c1,a2,b2} and
8126      {c2,a3,b3,c3}.  */
8127
8128   int vect_stmts_counter = 0;
8129   unsigned int index = 0;
8130   int first_vec_index = -1;
8131   int second_vec_index = -1;
8132   bool noop_p = true;
8133   *n_perms = 0;
8134
8135   vec_perm_builder mask;
8136   unsigned int nelts_to_build;
8137   unsigned int nvectors_per_build;
8138   unsigned int in_nlanes;
8139   bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
8140                       && multiple_p (nunits, group_size));
8141   if (repeating_p)
8142     {
8143       /* A single vector contains a whole number of copies of the node, so:
8144          (a) all permutes can use the same mask; and
8145          (b) the permutes only need a single vector input.  */
8146       mask.new_vector (nunits, group_size, 3);
8147       nelts_to_build = mask.encoded_nelts ();
8148       nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
8149       in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
8150     }
8151   else
8152     {
8153       /* We need to construct a separate mask for each vector statement.  */
8154       unsigned HOST_WIDE_INT const_nunits, const_vf;
8155       if (!nunits.is_constant (&const_nunits)
8156           || !vf.is_constant (&const_vf))
8157         return false;
8158       mask.new_vector (const_nunits, const_nunits, 1);
8159       nelts_to_build = const_vf * group_size;
8160       nvectors_per_build = 1;
8161       in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
8162     }
8163   auto_sbitmap used_in_lanes (in_nlanes);
8164   bitmap_clear (used_in_lanes);
8165   auto_bitmap used_defs;
8166
8167   unsigned int count = mask.encoded_nelts ();
8168   mask.quick_grow (count);
8169   vec_perm_indices indices;
8170
8171   for (unsigned int j = 0; j < nelts_to_build; j++)
8172     {
8173       unsigned int iter_num = j / group_size;
8174       unsigned int stmt_num = j % group_size;
8175       unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info) + perm[stmt_num]);
8176       bitmap_set_bit (used_in_lanes, i);
8177       if (repeating_p)
8178         {
8179           first_vec_index = 0;
8180           mask_element = i;
8181         }
8182       else
8183         {
8184           /* Enforced before the loop when !repeating_p.  */
8185           unsigned int const_nunits = nunits.to_constant ();
8186           vec_index = i / const_nunits;
8187           mask_element = i % const_nunits;
8188           if (vec_index == first_vec_index
8189               || first_vec_index == -1)
8190             {
8191               first_vec_index = vec_index;
8192             }
8193           else if (vec_index == second_vec_index
8194                    || second_vec_index == -1)
8195             {
8196               second_vec_index = vec_index;
8197               mask_element += const_nunits;
8198             }
8199           else
8200             {
8201               if (dump_p)
8202                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8203                                  "permutation requires at "
8204                                  "least three vectors %G",
8205                                  stmt_info->stmt);
8206               gcc_assert (analyze_only);
8207               return false;
8208             }
8209
8210           gcc_assert (mask_element < 2 * const_nunits);
8211         }
8212
8213       if (mask_element != index)
8214         noop_p = false;
8215       mask[index++] = mask_element;
8216
8217       if (index == count && !noop_p)
8218         {
8219           indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8220           if (!can_vec_perm_const_p (mode, mode, indices))
8221             {
8222               if (dump_p)
8223                 {
8224                   dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8225                                    vect_location,
8226                                    "unsupported vect permute { ");
8227                   for (i = 0; i < count; ++i)
8228                     {
8229                       dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8230                       dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8231                     }
8232                   dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8233                 }
8234               gcc_assert (analyze_only);
8235               return false;
8236             }
8237
8238           ++*n_perms;
8239         }
8240
8241       if (index == count)
8242         {
8243           if (!analyze_only)
8244             {
8245               tree mask_vec = NULL_TREE;
8246
8247               if (! noop_p)
8248                 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8249
8250               if (second_vec_index == -1)
8251                 second_vec_index = first_vec_index;
8252
8253               for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8254                 {
8255                   /* Generate the permute statement if necessary.  */
8256                   tree first_vec = dr_chain[first_vec_index + ri];
8257                   tree second_vec = dr_chain[second_vec_index + ri];
8258                   gimple *perm_stmt;
8259                   if (! noop_p)
8260                     {
8261                       gassign *stmt = as_a <gassign *> (stmt_info->stmt);
8262                       tree perm_dest
8263                         = vect_create_destination_var (gimple_assign_lhs (stmt),
8264                                                        vectype);
8265                       perm_dest = make_ssa_name (perm_dest);
8266                       perm_stmt
8267                         = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8268                                                first_vec, second_vec,
8269                                                mask_vec);
8270                       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8271                                                    gsi);
8272                       if (dce_chain)
8273                         {
8274                           bitmap_set_bit (used_defs, first_vec_index + ri);
8275                           bitmap_set_bit (used_defs, second_vec_index + ri);
8276                         }
8277                     }
8278                   else
8279                     {
8280                       /* If mask was NULL_TREE generate the requested
8281                          identity transform.  */
8282                       perm_stmt = SSA_NAME_DEF_STMT (first_vec);
8283                       if (dce_chain)
8284                         bitmap_set_bit (used_defs, first_vec_index + ri);
8285                     }
8286
8287                   /* Store the vector statement in NODE.  */
8288                   SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
8289                 }
8290             }
8291
8292           index = 0;
8293           first_vec_index = -1;
8294           second_vec_index = -1;
8295           noop_p = true;
8296         }
8297     }
8298
8299   if (n_loads)
8300     {
8301       if (repeating_p)
8302         *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8303       else
8304         {
8305           /* Enforced above when !repeating_p.  */
8306           unsigned int const_nunits = nunits.to_constant ();
8307           *n_loads = 0;
8308           bool load_seen = false;
8309           for (unsigned i = 0; i < in_nlanes; ++i)
8310             {
8311               if (i % const_nunits == 0)
8312                 {
8313                   if (load_seen)
8314                     *n_loads += 1;
8315                   load_seen = false;
8316                 }
8317               if (bitmap_bit_p (used_in_lanes, i))
8318                 load_seen = true;
8319             }
8320           if (load_seen)
8321             *n_loads += 1;
8322         }
8323     }
8324
8325   if (dce_chain)
8326     for (unsigned i = 0; i < dr_chain.length (); ++i)
8327       if (!bitmap_bit_p (used_defs, i))
8328         {
8329           gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
8330           gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8331           gsi_remove (&rgsi, true);
8332           release_defs (stmt);
8333         }
8334
8335   return true;
8336 }
8337
8338 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8339    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8340    permute statements for the SLP node NODE.  Store the number of vector
8341    permute instructions in *N_PERMS and the number of vector load
8342    instructions in *N_LOADS.  If DCE_CHAIN is true, remove all definitions
8343    that were not needed.  */
8344
8345 bool
8346 vect_transform_slp_perm_load (vec_info *vinfo,
8347                               slp_tree node, const vec<tree> &dr_chain,
8348                               gimple_stmt_iterator *gsi, poly_uint64 vf,
8349                               bool analyze_only, unsigned *n_perms,
8350                               unsigned int *n_loads, bool dce_chain)
8351 {
8352   return vect_transform_slp_perm_load_1 (vinfo, node,
8353                                          SLP_TREE_LOAD_PERMUTATION (node),
8354                                          dr_chain, gsi, vf, analyze_only,
8355                                          dump_enabled_p (), n_perms, n_loads,
8356                                          dce_chain);
8357 }
8358
8359 /* Produce the next vector result for SLP permutation NODE by adding a vector
8360    statement at GSI.  If MASK_VEC is nonnull, add:
8361
8362       <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8363
8364    otherwise add:
8365
8366       <new SSA name> = FIRST_DEF.  */
8367
8368 static void
8369 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8370                           slp_tree node, tree first_def, tree second_def,
8371                           tree mask_vec)
8372 {
8373   tree vectype = SLP_TREE_VECTYPE (node);
8374
8375   /* ???  We SLP match existing vector element extracts but
8376      allow punning which we need to re-instantiate at uses
8377      but have no good way of explicitly representing.  */
8378   if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8379       && !types_compatible_p (TREE_TYPE (first_def), vectype))
8380     {
8381       gassign *conv_stmt
8382         = gimple_build_assign (make_ssa_name (vectype),
8383                                build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8384       vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8385       first_def = gimple_assign_lhs (conv_stmt);
8386     }
8387   gassign *perm_stmt;
8388   tree perm_dest = make_ssa_name (vectype);
8389   if (mask_vec)
8390     {
8391       if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8392                            TYPE_SIZE (vectype))
8393           && !types_compatible_p (TREE_TYPE (second_def), vectype))
8394         {
8395           gassign *conv_stmt
8396             = gimple_build_assign (make_ssa_name (vectype),
8397                                    build1 (VIEW_CONVERT_EXPR,
8398                                            vectype, second_def));
8399           vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8400           second_def = gimple_assign_lhs (conv_stmt);
8401         }
8402       perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8403                                        first_def, second_def,
8404                                        mask_vec);
8405     }
8406   else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8407     {
8408       /* For identity permutes we still need to handle the case
8409          of lowpart extracts or concats.  */
8410       unsigned HOST_WIDE_INT c;
8411       auto first_def_nunits
8412         = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8413       if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8414         {
8415           tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8416                                  TYPE_SIZE (vectype), bitsize_zero_node);
8417           perm_stmt = gimple_build_assign (perm_dest, lowpart);
8418         }
8419       else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8420                                     first_def_nunits, &c) && c == 2)
8421         {
8422           tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8423                                             NULL_TREE, second_def);
8424           perm_stmt = gimple_build_assign (perm_dest, ctor);
8425         }
8426       else
8427         gcc_unreachable ();
8428     }
8429   else
8430     {
8431       /* We need a copy here in case the def was external.  */
8432       perm_stmt = gimple_build_assign (perm_dest, first_def);
8433     }
8434   vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8435   /* Store the vector statement in NODE.  */
8436   SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
8437 }
8438
8439 /* Subroutine of vectorizable_slp_permutation.  Check whether the target
8440    can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8441    If GSI is nonnull, emit the permutation there.
8442
8443    When GSI is null, the only purpose of NODE is to give properties
8444    of the result, such as the vector type and number of SLP lanes.
8445    The node does not need to be a VEC_PERM_EXPR.
8446
8447    If the target supports the operation, return the number of individual
8448    VEC_PERM_EXPRs needed, otherwise return -1.  Print information to the
8449    dump file if DUMP_P is true.  */
8450
8451 static int
8452 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8453                                 slp_tree node, lane_permutation_t &perm,
8454                                 vec<slp_tree> &children, bool dump_p)
8455 {
8456   tree vectype = SLP_TREE_VECTYPE (node);
8457
8458   /* ???  We currently only support all same vector input types
8459      while the SLP IL should really do a concat + select and thus accept
8460      arbitrary mismatches.  */
8461   slp_tree child;
8462   unsigned i;
8463   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8464   bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8465   tree op_vectype = NULL_TREE;
8466   FOR_EACH_VEC_ELT (children, i, child)
8467     if (SLP_TREE_VECTYPE (child))
8468       {
8469         op_vectype = SLP_TREE_VECTYPE (child);
8470         break;
8471       }
8472   if (!op_vectype)
8473     op_vectype = vectype;
8474   FOR_EACH_VEC_ELT (children, i, child)
8475     {
8476       if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8477            && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8478           || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8479           || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8480         {
8481           if (dump_p)
8482             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8483                              "Unsupported vector types in lane permutation\n");
8484           return -1;
8485         }
8486       if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8487         repeating_p = false;
8488     }
8489
8490   gcc_assert (perm.length () == SLP_TREE_LANES (node));
8491   if (dump_p)
8492     {
8493       dump_printf_loc (MSG_NOTE, vect_location,
8494                        "vectorizing permutation");
8495       for (unsigned i = 0; i < perm.length (); ++i)
8496         dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8497       if (repeating_p)
8498         dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8499       dump_printf (MSG_NOTE, "\n");
8500     }
8501
8502   /* REPEATING_P is true if every output vector is guaranteed to use the
8503      same permute vector.  We can handle that case for both variable-length
8504      and constant-length vectors, but we only handle other cases for
8505      constant-length vectors.
8506
8507      Set:
8508
8509      - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8510        mask vector that we want to build.
8511
8512      - NCOPIES to the number of copies of PERM that we need in order
8513        to build the necessary permute mask vectors.
8514
8515      - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8516        for each permute mask vector.  This is only relevant when GSI is
8517        nonnull.  */
8518   uint64_t npatterns;
8519   unsigned nelts_per_pattern;
8520   uint64_t ncopies;
8521   unsigned noutputs_per_mask;
8522   if (repeating_p)
8523     {
8524       /* We need a single permute mask vector that has the form:
8525
8526            { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8527
8528          In other words, the original n-element permute in PERM is
8529          "unrolled" to fill a full vector.  The stepped vector encoding
8530          that we use for permutes requires 3n elements.  */
8531       npatterns = SLP_TREE_LANES (node);
8532       nelts_per_pattern = ncopies = 3;
8533       noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8534     }
8535   else
8536     {
8537       /* Calculate every element of every permute mask vector explicitly,
8538          instead of relying on the pattern described above.  */
8539       if (!nunits.is_constant (&npatterns))
8540         return -1;
8541       nelts_per_pattern = ncopies = 1;
8542       if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
8543         if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
8544           return -1;
8545       noutputs_per_mask = 1;
8546     }
8547   unsigned olanes = ncopies * SLP_TREE_LANES (node);
8548   gcc_assert (repeating_p || multiple_p (olanes, nunits));
8549
8550   /* Compute the { { SLP operand, vector index}, lane } permutation sequence
8551      from the { SLP operand, scalar lane } permutation as recorded in the
8552      SLP node as intermediate step.  This part should already work
8553      with SLP children with arbitrary number of lanes.  */
8554   auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
8555   auto_vec<unsigned> active_lane;
8556   vperm.create (olanes);
8557   active_lane.safe_grow_cleared (children.length (), true);
8558   for (unsigned i = 0; i < ncopies; ++i)
8559     {
8560       for (unsigned pi = 0; pi < perm.length (); ++pi)
8561         {
8562           std::pair<unsigned, unsigned> p = perm[pi];
8563           tree vtype = SLP_TREE_VECTYPE (children[p.first]);
8564           if (repeating_p)
8565             vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
8566           else
8567             {
8568               /* We checked above that the vectors are constant-length.  */
8569               unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
8570               unsigned vi = (active_lane[p.first] + p.second) / vnunits;
8571               unsigned vl = (active_lane[p.first] + p.second) % vnunits;
8572               vperm.quick_push ({{p.first, vi}, vl});
8573             }
8574         }
8575       /* Advance to the next group.  */
8576       for (unsigned j = 0; j < children.length (); ++j)
8577         active_lane[j] += SLP_TREE_LANES (children[j]);
8578     }
8579
8580   if (dump_p)
8581     {
8582       dump_printf_loc (MSG_NOTE, vect_location,
8583                        "vectorizing permutation");
8584       for (unsigned i = 0; i < perm.length (); ++i)
8585         dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8586       if (repeating_p)
8587         dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8588       dump_printf (MSG_NOTE, "\n");
8589       dump_printf_loc (MSG_NOTE, vect_location, "as");
8590       for (unsigned i = 0; i < vperm.length (); ++i)
8591         {
8592           if (i != 0
8593               && (repeating_p
8594                   ? multiple_p (i, npatterns)
8595                   : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
8596             dump_printf (MSG_NOTE, ",");
8597           dump_printf (MSG_NOTE, " vops%u[%u][%u]",
8598                        vperm[i].first.first, vperm[i].first.second,
8599                        vperm[i].second);
8600         }
8601       dump_printf (MSG_NOTE, "\n");
8602     }
8603
8604   /* We can only handle two-vector permutes, everything else should
8605      be lowered on the SLP level.  The following is closely inspired
8606      by vect_transform_slp_perm_load and is supposed to eventually
8607      replace it.
8608      ???   As intermediate step do code-gen in the SLP tree representation
8609      somehow?  */
8610   std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
8611   std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
8612   unsigned int index = 0;
8613   poly_uint64 mask_element;
8614   vec_perm_builder mask;
8615   mask.new_vector (nunits, npatterns, nelts_per_pattern);
8616   unsigned int count = mask.encoded_nelts ();
8617   mask.quick_grow (count);
8618   vec_perm_indices indices;
8619   unsigned nperms = 0;
8620   for (unsigned i = 0; i < vperm.length (); ++i)
8621     {
8622       mask_element = vperm[i].second;
8623       if (first_vec.first == -1U
8624           || first_vec == vperm[i].first)
8625         first_vec = vperm[i].first;
8626       else if (second_vec.first == -1U
8627                || second_vec == vperm[i].first)
8628         {
8629           second_vec = vperm[i].first;
8630           mask_element += nunits;
8631         }
8632       else
8633         {
8634           if (dump_p)
8635             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8636                              "permutation requires at "
8637                              "least three vectors\n");
8638           gcc_assert (!gsi);
8639           return -1;
8640         }
8641
8642       mask[index++] = mask_element;
8643
8644       if (index == count)
8645         {
8646           indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
8647                               TYPE_VECTOR_SUBPARTS (op_vectype));
8648           bool identity_p = indices.series_p (0, 1, 0, 1);
8649           machine_mode vmode = TYPE_MODE (vectype);
8650           machine_mode op_vmode = TYPE_MODE (op_vectype);
8651           unsigned HOST_WIDE_INT c;
8652           if ((!identity_p
8653                && !can_vec_perm_const_p (vmode, op_vmode, indices))
8654               || (identity_p
8655                   && !known_le (nunits,
8656                                 TYPE_VECTOR_SUBPARTS (op_vectype))
8657                   && (!constant_multiple_p (nunits,
8658                                             TYPE_VECTOR_SUBPARTS (op_vectype),
8659                                             &c) || c != 2)))
8660             {
8661               if (dump_p)
8662                 {
8663                   dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8664                                    vect_location,
8665                                    "unsupported vect permute { ");
8666                   for (i = 0; i < count; ++i)
8667                     {
8668                       dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8669                       dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8670                     }
8671                   dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8672                 }
8673               gcc_assert (!gsi);
8674               return -1;
8675             }
8676
8677           if (!identity_p)
8678             nperms++;
8679           if (gsi)
8680             {
8681               if (second_vec.first == -1U)
8682                 second_vec = first_vec;
8683
8684               slp_tree
8685                 first_node = children[first_vec.first],
8686                 second_node = children[second_vec.first];
8687
8688               tree mask_vec = NULL_TREE;
8689               if (!identity_p)
8690                 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8691
8692               for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
8693                 {
8694                   tree first_def
8695                     = vect_get_slp_vect_def (first_node,
8696                                              first_vec.second + vi);
8697                   tree second_def
8698                     = vect_get_slp_vect_def (second_node,
8699                                              second_vec.second + vi);
8700                   vect_add_slp_permutation (vinfo, gsi, node, first_def,
8701                                             second_def, mask_vec);
8702                 }
8703             }
8704
8705           index = 0;
8706           first_vec = std::make_pair (-1U, -1U);
8707           second_vec = std::make_pair (-1U, -1U);
8708         }
8709     }
8710
8711   return nperms;
8712 }
8713
8714 /* Vectorize the SLP permutations in NODE as specified
8715    in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
8716    child number and lane number.
8717    Interleaving of two two-lane two-child SLP subtrees (not supported):
8718      [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
8719    A blend of two four-lane two-child SLP subtrees:
8720      [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
8721    Highpart of a four-lane one-child SLP subtree (not supported):
8722      [ { 0, 2 }, { 0, 3 } ]
8723    Where currently only a subset is supported by code generating below.  */
8724
8725 static bool
8726 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8727                               slp_tree node, stmt_vector_for_cost *cost_vec)
8728 {
8729   tree vectype = SLP_TREE_VECTYPE (node);
8730   lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
8731   int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
8732                                                SLP_TREE_CHILDREN (node),
8733                                                dump_enabled_p ());
8734   if (nperms < 0)
8735     return false;
8736
8737   if (!gsi)
8738     record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
8739
8740   return true;
8741 }
8742
8743 /* Vectorize SLP NODE.  */
8744
8745 static void
8746 vect_schedule_slp_node (vec_info *vinfo,
8747                         slp_tree node, slp_instance instance)
8748 {
8749   gimple_stmt_iterator si;
8750   int i;
8751   slp_tree child;
8752
8753   /* For existing vectors there's nothing to do.  */
8754   if (SLP_TREE_VEC_DEFS (node).exists ())
8755     return;
8756
8757   gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
8758
8759   /* Vectorize externals and constants.  */
8760   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
8761       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
8762     {
8763       /* ???  vectorizable_shift can end up using a scalar operand which is
8764          currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
8765          node in this case.  */
8766       if (!SLP_TREE_VECTYPE (node))
8767         return;
8768
8769       vect_create_constant_vectors (vinfo, node);
8770       return;
8771     }
8772
8773   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
8774
8775   gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
8776   SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
8777
8778   if (dump_enabled_p ())
8779     dump_printf_loc (MSG_NOTE, vect_location,
8780                      "------>vectorizing SLP node starting from: %G",
8781                      stmt_info->stmt);
8782
8783   if (STMT_VINFO_DATA_REF (stmt_info)
8784       && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8785     {
8786       /* Vectorized loads go before the first scalar load to make it
8787          ready early, vectorized stores go before the last scalar
8788          stmt which is where all uses are ready.  */
8789       stmt_vec_info last_stmt_info = NULL;
8790       if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
8791         last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
8792       else /* DR_IS_WRITE */
8793         last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
8794       si = gsi_for_stmt (last_stmt_info->stmt);
8795     }
8796   else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
8797             || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
8798             || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
8799            && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8800     {
8801       /* For PHI node vectorization we do not use the insertion iterator.  */
8802       si = gsi_none ();
8803     }
8804   else
8805     {
8806       /* Emit other stmts after the children vectorized defs which is
8807          earliest possible.  */
8808       gimple *last_stmt = NULL;
8809       bool seen_vector_def = false;
8810       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8811         if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8812           {
8813             /* For fold-left reductions we are retaining the scalar
8814                reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
8815                set so the representation isn't perfect.  Resort to the
8816                last scalar def here.  */
8817             if (SLP_TREE_VEC_STMTS (child).is_empty ())
8818               {
8819                 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
8820                             == cycle_phi_info_type);
8821                 gphi *phi = as_a <gphi *>
8822                               (vect_find_last_scalar_stmt_in_slp (child)->stmt);
8823                 if (!last_stmt
8824                     || vect_stmt_dominates_stmt_p (last_stmt, phi))
8825                   last_stmt = phi;
8826               }
8827             /* We are emitting all vectorized stmts in the same place and
8828                the last one is the last.
8829                ???  Unless we have a load permutation applied and that
8830                figures to re-use an earlier generated load.  */
8831             unsigned j;
8832             gimple *vstmt;
8833             FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
8834               if (!last_stmt
8835                   || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
8836                 last_stmt = vstmt;
8837           }
8838         else if (!SLP_TREE_VECTYPE (child))
8839           {
8840             /* For externals we use unvectorized at all scalar defs.  */
8841             unsigned j;
8842             tree def;
8843             FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
8844               if (TREE_CODE (def) == SSA_NAME
8845                   && !SSA_NAME_IS_DEFAULT_DEF (def))
8846                 {
8847                   gimple *stmt = SSA_NAME_DEF_STMT (def);
8848                   if (!last_stmt
8849                       || vect_stmt_dominates_stmt_p (last_stmt, stmt))
8850                     last_stmt = stmt;
8851                 }
8852           }
8853         else
8854           {
8855             /* For externals we have to look at all defs since their
8856                insertion place is decided per vector.  But beware
8857                of pre-existing vectors where we need to make sure
8858                we do not insert before the region boundary.  */
8859             if (SLP_TREE_SCALAR_OPS (child).is_empty ()
8860                 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
8861               seen_vector_def = true;
8862             else
8863               {
8864                 unsigned j;
8865                 tree vdef;
8866                 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
8867                   if (TREE_CODE (vdef) == SSA_NAME
8868                       && !SSA_NAME_IS_DEFAULT_DEF (vdef))
8869                     {
8870                       gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
8871                       if (!last_stmt
8872                           || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
8873                         last_stmt = vstmt;
8874                     }
8875               }
8876           }
8877       /* This can happen when all children are pre-existing vectors or
8878          constants.  */
8879       if (!last_stmt)
8880         last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
8881       if (!last_stmt)
8882         {
8883           gcc_assert (seen_vector_def);
8884           si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
8885         }
8886       else if (is_ctrl_altering_stmt (last_stmt))
8887         {
8888           /* We split regions to vectorize at control altering stmts
8889              with a definition so this must be an external which
8890              we can insert at the start of the region.  */
8891           si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
8892         }
8893       else if (is_a <bb_vec_info> (vinfo)
8894                && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
8895                && gimple_could_trap_p (stmt_info->stmt))
8896         {
8897           /* We've constrained possibly trapping operations to all come
8898              from the same basic-block, if vectorized defs would allow earlier
8899              scheduling still force vectorized stmts to the original block.
8900              This is only necessary for BB vectorization since for loop vect
8901              all operations are in a single BB and scalar stmt based
8902              placement doesn't play well with epilogue vectorization.  */
8903           gcc_assert (dominated_by_p (CDI_DOMINATORS,
8904                                       gimple_bb (stmt_info->stmt),
8905                                       gimple_bb (last_stmt)));
8906           si = gsi_after_labels (gimple_bb (stmt_info->stmt));
8907         }
8908       else if (is_a <gphi *> (last_stmt))
8909         si = gsi_after_labels (gimple_bb (last_stmt));
8910       else
8911         {
8912           si = gsi_for_stmt (last_stmt);
8913           gsi_next (&si);
8914         }
8915     }
8916
8917   /* Handle purely internal nodes.  */
8918   if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
8919     {
8920       /* ???  the transform kind is stored to STMT_VINFO_TYPE which might
8921          be shared with different SLP nodes (but usually it's the same
8922          operation apart from the case the stmt is only there for denoting
8923          the actual scalar lane defs ...).  So do not call vect_transform_stmt
8924          but open-code it here (partly).  */
8925       bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
8926       gcc_assert (done);
8927       stmt_vec_info slp_stmt_info;
8928       unsigned int i;
8929       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
8930         if (STMT_VINFO_LIVE_P (slp_stmt_info))
8931           {
8932             done = vectorizable_live_operation (vinfo,
8933                                                 slp_stmt_info, &si, node,
8934                                                 instance, i, true, NULL);
8935             gcc_assert (done);
8936           }
8937     }
8938   else
8939     vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
8940 }
8941
8942 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
8943    For loop vectorization this is done in vectorizable_call, but for SLP
8944    it needs to be deferred until end of vect_schedule_slp, because multiple
8945    SLP instances may refer to the same scalar stmt.  */
8946
8947 static void
8948 vect_remove_slp_scalar_calls (vec_info *vinfo,
8949                               slp_tree node, hash_set<slp_tree> &visited)
8950 {
8951   gimple *new_stmt;
8952   gimple_stmt_iterator gsi;
8953   int i;
8954   slp_tree child;
8955   tree lhs;
8956   stmt_vec_info stmt_info;
8957
8958   if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
8959     return;
8960
8961   if (visited.add (node))
8962     return;
8963
8964   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8965     vect_remove_slp_scalar_calls (vinfo, child, visited);
8966
8967   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8968     {
8969       gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
8970       if (!stmt || gimple_bb (stmt) == NULL)
8971         continue;
8972       if (is_pattern_stmt_p (stmt_info)
8973           || !PURE_SLP_STMT (stmt_info))
8974         continue;
8975       lhs = gimple_call_lhs (stmt);
8976       new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
8977       gsi = gsi_for_stmt (stmt);
8978       vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
8979       SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
8980     }
8981 }
8982
8983 static void
8984 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
8985 {
8986   hash_set<slp_tree> visited;
8987   vect_remove_slp_scalar_calls (vinfo, node, visited);
8988 }
8989
8990 /* Vectorize the instance root.  */
8991
8992 void
8993 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
8994 {
8995   gassign *rstmt = NULL;
8996
8997   if (instance->kind == slp_inst_kind_ctor)
8998     {
8999       if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9000         {
9001           gimple *child_stmt = SLP_TREE_VEC_STMTS (node)[0];
9002           tree vect_lhs = gimple_get_lhs (child_stmt);
9003           tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9004           if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9005                                           TREE_TYPE (vect_lhs)))
9006             vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9007                                vect_lhs);
9008           rstmt = gimple_build_assign (root_lhs, vect_lhs);
9009         }
9010       else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9011         {
9012           int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9013           gimple *child_stmt;
9014           int j;
9015           vec<constructor_elt, va_gc> *v;
9016           vec_alloc (v, nelts);
9017
9018           /* A CTOR can handle V16HI composition from VNx8HI so we
9019              do not need to convert vector elements if the types
9020              do not match.  */
9021           FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
9022             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
9023                                     gimple_get_lhs (child_stmt));
9024           tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9025           tree rtype
9026             = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9027           tree r_constructor = build_constructor (rtype, v);
9028           rstmt = gimple_build_assign (lhs, r_constructor);
9029         }
9030     }
9031   else if (instance->kind == slp_inst_kind_bb_reduc)
9032     {
9033       /* Largely inspired by reduction chain epilogue handling in
9034          vect_create_epilog_for_reduction.  */
9035       vec<tree> vec_defs = vNULL;
9036       vect_get_slp_defs (node, &vec_defs);
9037       enum tree_code reduc_code
9038         = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9039       /* ???  We actually have to reflect signs somewhere.  */
9040       if (reduc_code == MINUS_EXPR)
9041         reduc_code = PLUS_EXPR;
9042       gimple_seq epilogue = NULL;
9043       /* We may end up with more than one vector result, reduce them
9044          to one vector.  */
9045       tree vec_def = vec_defs[0];
9046       for (unsigned i = 1; i < vec_defs.length (); ++i)
9047         vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
9048                                 vec_def, vec_defs[i]);
9049       vec_defs.release ();
9050       /* ???  Support other schemes than direct internal fn.  */
9051       internal_fn reduc_fn;
9052       if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9053           || reduc_fn == IFN_LAST)
9054         gcc_unreachable ();
9055       tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9056                                       TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
9057
9058       gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9059       gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9060       gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9061       update_stmt (gsi_stmt (rgsi));
9062       return;
9063     }
9064   else
9065     gcc_unreachable ();
9066
9067   gcc_assert (rstmt);
9068
9069   gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9070   gsi_replace (&rgsi, rstmt, true);
9071 }
9072
/* State kept per SLP node by the SCC collecting DFS walk done in
   vect_schedule_scc.  */

struct slp_scc_info
{
  /* Set while the node sits on the walk's stack, that is, it was visited
     but not yet scheduled.  */
  bool on_stack;
  /* DFS number handed to the node on its first visit.  */
  int dfs;
  /* Smallest value seen so far among the node's own DFS number, the
     lowlink of children first visited from it and the DFS number of
     children still on the stack.  */
  int lowlink;
};
9079
9080 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs.  */
9081
9082 static void
9083 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9084                    hash_map<slp_tree, slp_scc_info> &scc_info,
9085                    int &maxdfs, vec<slp_tree> &stack)
9086 {
9087   bool existed_p;
9088   slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9089   gcc_assert (!existed_p);
9090   info->dfs = maxdfs;
9091   info->lowlink = maxdfs;
9092   maxdfs++;
9093
9094   /* Leaf.  */
9095   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9096     {
9097       info->on_stack = false;
9098       vect_schedule_slp_node (vinfo, node, instance);
9099       return;
9100     }
9101
9102   info->on_stack = true;
9103   stack.safe_push (node);
9104
9105   unsigned i;
9106   slp_tree child;
9107   /* DFS recurse.  */
9108   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9109     {
9110       if (!child)
9111         continue;
9112       slp_scc_info *child_info = scc_info.get (child);
9113       if (!child_info)
9114         {
9115           vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9116           /* Recursion might have re-allocated the node.  */
9117           info = scc_info.get (node);
9118           child_info = scc_info.get (child);
9119           info->lowlink = MIN (info->lowlink, child_info->lowlink);
9120         }
9121       else if (child_info->on_stack)
9122         info->lowlink = MIN (info->lowlink, child_info->dfs);
9123     }
9124   if (info->lowlink != info->dfs)
9125     return;
9126
9127   auto_vec<slp_tree, 4> phis_to_fixup;
9128
9129   /* Singleton.  */
9130   if (stack.last () == node)
9131     {
9132       stack.pop ();
9133       info->on_stack = false;
9134       vect_schedule_slp_node (vinfo, node, instance);
9135       if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9136           && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9137         phis_to_fixup.quick_push (node);
9138     }
9139   else
9140     {
9141       /* SCC.  */
9142       int last_idx = stack.length () - 1;
9143       while (stack[last_idx] != node)
9144         last_idx--;
9145       /* We can break the cycle at PHIs who have at least one child
9146          code generated.  Then we could re-start the DFS walk until
9147          all nodes in the SCC are covered (we might have new entries
9148          for only back-reachable nodes).  But it's simpler to just
9149          iterate and schedule those that are ready.  */
9150       unsigned todo = stack.length () - last_idx;
9151       do
9152         {
9153           for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9154             {
9155               slp_tree entry = stack[idx];
9156               if (!entry)
9157                 continue;
9158               bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9159                           && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9160               bool ready = !phi;
9161               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9162                   if (!child)
9163                     {
9164                       gcc_assert (phi);
9165                       ready = true;
9166                       break;
9167                     }
9168                   else if (scc_info.get (child)->on_stack)
9169                     {
9170                       if (!phi)
9171                         {
9172                           ready = false;
9173                           break;
9174                         }
9175                     }
9176                   else
9177                     {
9178                       if (phi)
9179                         {
9180                           ready = true;
9181                           break;
9182                         }
9183                     }
9184               if (ready)
9185                 {
9186                   vect_schedule_slp_node (vinfo, entry, instance);
9187                   scc_info.get (entry)->on_stack = false;
9188                   stack[idx] = NULL;
9189                   todo--;
9190                   if (phi)
9191                     phis_to_fixup.safe_push (entry);
9192                 }
9193             }
9194         }
9195       while (todo != 0);
9196
9197       /* Pop the SCC.  */
9198       stack.truncate (last_idx);
9199     }
9200
9201   /* Now fixup the backedge def of the vectorized PHIs in this SCC.  */
9202   slp_tree phi_node;
9203   FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9204     {
9205       gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9206       edge_iterator ei;
9207       edge e;
9208       FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9209         {
9210           unsigned dest_idx = e->dest_idx;
9211           child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9212           if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9213             continue;
9214           unsigned n = SLP_TREE_VEC_STMTS (phi_node).length ();
9215           /* Simply fill all args.  */
9216           if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9217               != vect_first_order_recurrence)
9218             for (unsigned i = 0; i < n; ++i)
9219               add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
9220                            vect_get_slp_vect_def (child, i),
9221                            e, gimple_phi_arg_location (phi, dest_idx));
9222           else
9223             {
9224               /* Unless it is a first order recurrence which needs
9225                  args filled in for both the PHI node and the permutes.  */
9226               gimple *perm = SLP_TREE_VEC_STMTS (phi_node)[0];
9227               gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9228               add_phi_arg (as_a <gphi *> (rphi),
9229                            vect_get_slp_vect_def (child, n - 1),
9230                            e, gimple_phi_arg_location (phi, dest_idx));
9231               for (unsigned i = 0; i < n; ++i)
9232                 {
9233                   gimple *perm = SLP_TREE_VEC_STMTS (phi_node)[i];
9234                   if (i > 0)
9235                     gimple_assign_set_rhs1 (perm,
9236                                             vect_get_slp_vect_def (child, i - 1));
9237                   gimple_assign_set_rhs2 (perm,
9238                                           vect_get_slp_vect_def (child, i));
9239                   update_stmt (perm);
9240                 }
9241             }
9242         }
9243     }
9244 }
9245
9246 /* Generate vector code for SLP_INSTANCES in the loop/basic block.  */
9247
9248 void
9249 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9250 {
9251   slp_instance instance;
9252   unsigned int i;
9253
9254   hash_map<slp_tree, slp_scc_info> scc_info;
9255   int maxdfs = 0;
9256   FOR_EACH_VEC_ELT (slp_instances, i, instance)
9257     {
9258       slp_tree node = SLP_INSTANCE_TREE (instance);
9259       if (dump_enabled_p ())
9260         {
9261           dump_printf_loc (MSG_NOTE, vect_location,
9262                            "Vectorizing SLP tree:\n");
9263           /* ???  Dump all?  */
9264           if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9265             dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9266                          SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9267           vect_print_slp_graph (MSG_NOTE, vect_location,
9268                                 SLP_INSTANCE_TREE (instance));
9269         }
9270       /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
9271          have a PHI be the node breaking the cycle.  */
9272       auto_vec<slp_tree> stack;
9273       if (!scc_info.get (node))
9274         vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9275
9276       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9277         vectorize_slp_instance_root_stmt (node, instance);
9278
9279       if (dump_enabled_p ())
9280         dump_printf_loc (MSG_NOTE, vect_location,
9281                          "vectorizing stmts using SLP.\n");
9282     }
9283
9284   FOR_EACH_VEC_ELT (slp_instances, i, instance)
9285     {
9286       slp_tree root = SLP_INSTANCE_TREE (instance);
9287       stmt_vec_info store_info;
9288       unsigned int j;
9289
9290       /* Remove scalar call stmts.  Do not do this for basic-block
9291          vectorization as not all uses may be vectorized.
9292          ???  Why should this be necessary?  DCE should be able to
9293          remove the stmts itself.
9294          ???  For BB vectorization we can as well remove scalar
9295          stmts starting from the SLP tree root if they have no
9296          uses.  */
9297       if (is_a <loop_vec_info> (vinfo))
9298         vect_remove_slp_scalar_calls (vinfo, root);
9299
9300       /* Remove vectorized stores original scalar stmts.  */
9301       for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9302         {
9303           if (!STMT_VINFO_DATA_REF (store_info)
9304               || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9305             break;
9306
9307           store_info = vect_orig_stmt (store_info);
9308           /* Free the attached stmt_vec_info and remove the stmt.  */
9309           vinfo->remove_stmt (store_info);
9310
9311           /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
9312              to not crash in vect_free_slp_tree later.  */
9313           if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9314             SLP_TREE_REPRESENTATIVE (root) = NULL;
9315         }
9316     }
9317 }