[official-gcc.git] / gcc / tree-vect-slp.cc
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "insn-config.h"
35 #include "recog.h" /* FIXME: for insn_data */
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 #include "dump-context.h"
49 #include "cfganal.h"
50 #include "tree-eh.h"
51 #include "tree-cfg.h"
52 #include "alloc-pool.h"
53 #include "sreal.h"
54 #include "predict.h"
56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
71 static object_allocator<_slp_tree> *slp_tree_pool;
72 static slp_tree slp_first_node;
74 void
75 vect_slp_init (void)
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
80 void
81 vect_slp_fini (void)
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
89 void *
90 _slp_tree::operator new (size_t n)
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
96 void
97 _slp_tree::operator delete (void *node, size_t n)
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (node);
104 /* Initialize a SLP node. */
106 _slp_tree::_slp_tree ()
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_STMTS (this) = vNULL;
116 SLP_TREE_VEC_DEFS (this) = vNULL;
117 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
118 SLP_TREE_CHILDREN (this) = vNULL;
119 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
120 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
121 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
122 SLP_TREE_CODE (this) = ERROR_MARK;
123 SLP_TREE_VECTYPE (this) = NULL_TREE;
124 SLP_TREE_REPRESENTATIVE (this) = NULL;
125 SLP_TREE_REF_COUNT (this) = 1;
126 this->failed = NULL;
127 this->max_nunits = 1;
128 this->lanes = 0;
131 /* Tear down a SLP node. */
133 _slp_tree::~_slp_tree ()
135 if (this->prev_node)
136 this->prev_node->next_node = this->next_node;
137 else
138 slp_first_node = this->next_node;
139 if (this->next_node)
140 this->next_node->prev_node = this->prev_node;
141 SLP_TREE_CHILDREN (this).release ();
142 SLP_TREE_SCALAR_STMTS (this).release ();
143 SLP_TREE_SCALAR_OPS (this).release ();
144 SLP_TREE_VEC_STMTS (this).release ();
145 SLP_TREE_VEC_DEFS (this).release ();
146 SLP_TREE_LOAD_PERMUTATION (this).release ();
147 SLP_TREE_LANE_PERMUTATION (this).release ();
148 if (this->failed)
149 free (failed);
152 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
154 void
155 vect_free_slp_tree (slp_tree node)
157 int i;
158 slp_tree child;
160 if (--SLP_TREE_REF_COUNT (node) != 0)
161 return;
163 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
164 if (child)
165 vect_free_slp_tree (child);
167 /* If the node defines any SLP only patterns then those patterns are no
168 longer valid and should be removed. */
169 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
170 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
172 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
173 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
174 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
177 delete node;
180 /* Return a location suitable for dumps related to the SLP instance. */
182 dump_user_location_t
183 _slp_instance::location () const
185 if (!root_stmts.is_empty ())
186 return root_stmts[0]->stmt;
187 else
188 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
192 /* Free the memory allocated for the SLP instance. */
194 void
195 vect_free_slp_instance (slp_instance instance)
197 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
198 SLP_INSTANCE_LOADS (instance).release ();
199 SLP_INSTANCE_ROOT_STMTS (instance).release ();
200 instance->subgraph_entries.release ();
201 instance->cost_vec.release ();
202 free (instance);
206 /* Create an SLP node with NOPS children and operation code CODE. */
208 slp_tree
209 vect_create_new_slp_node (unsigned nops, tree_code code)
211 slp_tree node = new _slp_tree;
212 SLP_TREE_SCALAR_STMTS (node) = vNULL;
213 SLP_TREE_CHILDREN (node).create (nops);
214 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
215 SLP_TREE_CODE (node) = code;
216 return node;
218 /* Create an SLP node for SCALAR_STMTS. */
220 static slp_tree
221 vect_create_new_slp_node (slp_tree node,
222 vec<stmt_vec_info> scalar_stmts, unsigned nops)
224 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
225 SLP_TREE_CHILDREN (node).create (nops);
226 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
227 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
228 SLP_TREE_LANES (node) = scalar_stmts.length ();
229 return node;
232 /* Create an SLP node for SCALAR_STMTS. */
234 static slp_tree
235 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
237 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
240 /* Create an SLP node for OPS. */
242 static slp_tree
243 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
245 SLP_TREE_SCALAR_OPS (node) = ops;
246 SLP_TREE_DEF_TYPE (node) = vect_external_def;
247 SLP_TREE_LANES (node) = ops.length ();
248 return node;
251 /* Create an SLP node for OPS. */
253 static slp_tree
254 vect_create_new_slp_node (vec<tree> ops)
256 return vect_create_new_slp_node (new _slp_tree, ops);
260 /* This structure is used in creation of an SLP tree. Each instance
261 corresponds to the same operand in a group of scalar stmts in an SLP
262 node. */
263 typedef struct _slp_oprnd_info
265 /* Def-stmts for the operands. */
266 vec<stmt_vec_info> def_stmts;
267 /* Operands. */
268 vec<tree> ops;
269 /* Information about the first statement, its vector def-type, type, the
270 operand itself in case it's constant, and an indication if it's a pattern
271 stmt. */
272 tree first_op_type;
273 enum vect_def_type first_dt;
274 bool any_pattern;
275 } *slp_oprnd_info;
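/* Illustrative note (not part of the original sources): for a two-lane SLP
   group { a0 = b0 + c0;  a1 = b1 + c1; } discovery builds one slp_oprnd_info
   per operand position, each collecting that operand across all lanes:
     oprnds_info[0]: ops = { b0, b1 }, def_stmts = defs of b0 and b1
     oprnds_info[1]: ops = { c0, c1 }, def_stmts = defs of c0 and c1
   first_dt / first_op_type record the def type and type seen in lane 0 and
   the remaining lanes are checked against them.  */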
278 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
279 operand. */
280 static vec<slp_oprnd_info>
281 vect_create_oprnd_info (int nops, int group_size)
283 int i;
284 slp_oprnd_info oprnd_info;
285 vec<slp_oprnd_info> oprnds_info;
287 oprnds_info.create (nops);
288 for (i = 0; i < nops; i++)
290 oprnd_info = XNEW (struct _slp_oprnd_info);
291 oprnd_info->def_stmts.create (group_size);
292 oprnd_info->ops.create (group_size);
293 oprnd_info->first_dt = vect_uninitialized_def;
294 oprnd_info->first_op_type = NULL_TREE;
295 oprnd_info->any_pattern = false;
296 oprnds_info.quick_push (oprnd_info);
299 return oprnds_info;
303 /* Free operands info. */
305 static void
306 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
308 int i;
309 slp_oprnd_info oprnd_info;
311 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
313 oprnd_info->def_stmts.release ();
314 oprnd_info->ops.release ();
315 XDELETE (oprnd_info);
318 oprnds_info.release ();
321 /* Return the execution frequency of NODE (so that a higher value indicates
322 a "more important" node when optimizing for speed). */
324 static sreal
325 vect_slp_node_weight (slp_tree node)
327 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
328 basic_block bb = gimple_bb (stmt_info->stmt);
329 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
332 /* Return true if STMTS contains a pattern statement. */
334 static bool
335 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
337 stmt_vec_info stmt_info;
338 unsigned int i;
339 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
340 if (is_pattern_stmt_p (stmt_info))
341 return true;
342 return false;
345 /* Return true when all lanes in the external or constant NODE have
346 the same value. */
348 static bool
349 vect_slp_tree_uniform_p (slp_tree node)
351 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
352 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
354 /* Pre-existing vectors. */
355 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
356 return false;
358 unsigned i;
359 tree op, first = NULL_TREE;
360 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
361 if (!first)
362 first = op;
363 else if (!operand_equal_p (first, op, 0))
364 return false;
366 return true;
369 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
370 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
371 of the chain. */
374 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
375 stmt_vec_info first_stmt_info)
377 stmt_vec_info next_stmt_info = first_stmt_info;
378 int result = 0;
380 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
381 return -1;
385 if (next_stmt_info == stmt_info)
386 return result;
387 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
388 if (next_stmt_info)
389 result += DR_GROUP_GAP (next_stmt_info);
391 while (next_stmt_info);
393 return -1;
396 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
397 using the method implemented by duplicate_and_interleave. Return true
398 if so, returning the number of intermediate vectors in *NVECTORS_OUT
399 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
400 (if nonnull). */
402 bool
403 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
404 tree elt_type, unsigned int *nvectors_out,
405 tree *vector_type_out,
406 tree *permutes)
408 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
409 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
410 return false;
412 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
413 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
414 unsigned int nvectors = 1;
415 for (;;)
417 scalar_int_mode int_mode;
418 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
419 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
421 /* Get the natural vector type for this SLP group size. */
422 tree int_type = build_nonstandard_integer_type
423 (GET_MODE_BITSIZE (int_mode), 1);
424 tree vector_type
425 = get_vectype_for_scalar_type (vinfo, int_type, count);
426 poly_int64 half_nelts;
427 if (vector_type
428 && VECTOR_MODE_P (TYPE_MODE (vector_type))
429 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
430 GET_MODE_SIZE (base_vector_mode))
431 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
432 2, &half_nelts))
434 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
435 together into elements of type INT_TYPE and using the result
436 to build NVECTORS vectors. */
437 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
438 vec_perm_builder sel1 (nelts, 2, 3);
439 vec_perm_builder sel2 (nelts, 2, 3);
441 for (unsigned int i = 0; i < 3; ++i)
443 sel1.quick_push (i);
444 sel1.quick_push (i + nelts);
445 sel2.quick_push (half_nelts + i);
446 sel2.quick_push (half_nelts + i + nelts);
448 vec_perm_indices indices1 (sel1, 2, nelts);
449 vec_perm_indices indices2 (sel2, 2, nelts);
450 machine_mode vmode = TYPE_MODE (vector_type);
451 if (can_vec_perm_const_p (vmode, vmode, indices1)
452 && can_vec_perm_const_p (vmode, vmode, indices2))
454 if (nvectors_out)
455 *nvectors_out = nvectors;
456 if (vector_type_out)
457 *vector_type_out = vector_type;
458 if (permutes)
460 permutes[0] = vect_gen_perm_mask_checked (vector_type,
461 indices1);
462 permutes[1] = vect_gen_perm_mask_checked (vector_type,
463 indices2);
465 return true;
469 if (!multiple_p (elt_bytes, 2, &elt_bytes))
470 return false;
471 nvectors *= 2;
475 /* Return true if DTA and DTB match. */
477 static bool
478 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
480 return (dta == dtb
481 || ((dta == vect_external_def || dta == vect_constant_def)
482 && (dtb == vect_external_def || dtb == vect_constant_def)));
485 static const int cond_expr_maps[3][5] = {
486 { 4, -1, -2, 1, 2 },
487 { 4, -2, -1, 1, 2 },
488 { 4, -1, -2, 2, 1 }
490 static const int arg1_map[] = { 1, 1 };
491 static const int arg2_map[] = { 1, 2 };
492 static const int arg1_arg4_map[] = { 2, 1, 4 };
493 static const int op1_op0_map[] = { 2, 1, 0 };
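/* Illustrative note (not part of the original sources): each map above is
   laid out as { number-of-child-nodes, arg-index, arg-index, ... } as the
   comment below for vect_get_operand_map describes.  For example,
   arg1_arg4_map = { 2, 1, 4 } says a masked gather load gets two SLP
   children, built from call arguments 1 and 4, while
   cond_expr_maps[0] = { 4, -1, -2, 1, 2 } says a COND_EXPR with an embedded
   comparison gets four children: the two comparison operands (special
   indices -1/-2) followed by statement arguments 1 and 2.  */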
495 /* For most SLP statements, there is a one-to-one mapping between
496 gimple arguments and child nodes. If that is not true for STMT,
497 return an array that contains:
499 - the number of child nodes, followed by
500 - for each child node, the index of the argument associated with that node.
501 The special index -1 is the first operand of an embedded comparison and
502 the special index -2 is the second operand of an embedded comparison.
504 SWAP is as for vect_get_and_check_slp_defs. */
506 static const int *
507 vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
509 if (auto assign = dyn_cast<const gassign *> (stmt))
511 if (gimple_assign_rhs_code (assign) == COND_EXPR
512 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
513 return cond_expr_maps[swap];
514 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
515 && swap)
516 return op1_op0_map;
518 gcc_assert (!swap);
519 if (auto call = dyn_cast<const gcall *> (stmt))
521 if (gimple_call_internal_p (call))
522 switch (gimple_call_internal_fn (call))
524 case IFN_MASK_LOAD:
525 return arg2_map;
527 case IFN_GATHER_LOAD:
528 return arg1_map;
530 case IFN_MASK_GATHER_LOAD:
531 return arg1_arg4_map;
533 default:
534 break;
537 return nullptr;
540 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
541 they are of a valid type and that they match the defs of the first stmt of
542 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
543 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
544 indicates swap is required for cond_expr stmts. Specifically, SWAP
545 is 1 if STMT is cond and operands of comparison need to be swapped;
546 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
548 If there was a fatal error return -1; if the error could be corrected by
549 swapping operands of the parent of this node, return 1; if everything is
550 ok return 0. */
551 static int
552 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
553 bool *skip_args,
554 vec<stmt_vec_info> stmts, unsigned stmt_num,
555 vec<slp_oprnd_info> *oprnds_info)
557 stmt_vec_info stmt_info = stmts[stmt_num];
558 tree oprnd;
559 unsigned int i, number_of_oprnds;
560 enum vect_def_type dt = vect_uninitialized_def;
561 slp_oprnd_info oprnd_info;
562 unsigned int commutative_op = -1U;
563 bool first = stmt_num == 0;
565 if (!is_a<gcall *> (stmt_info->stmt)
566 && !is_a<gassign *> (stmt_info->stmt)
567 && !is_a<gphi *> (stmt_info->stmt))
568 return -1;
570 number_of_oprnds = gimple_num_args (stmt_info->stmt);
571 const int *map = vect_get_operand_map (stmt_info->stmt, swap);
572 if (map)
573 number_of_oprnds = *map++;
574 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
576 if (gimple_call_internal_p (stmt))
578 internal_fn ifn = gimple_call_internal_fn (stmt);
579 commutative_op = first_commutative_argument (ifn);
582 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
584 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
585 commutative_op = 0;
588 bool swapped = (swap != 0);
589 bool backedge = false;
590 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
591 for (i = 0; i < number_of_oprnds; i++)
593 int opno = map ? map[i] : int (i);
594 if (opno < 0)
595 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
596 else
598 oprnd = gimple_arg (stmt_info->stmt, opno);
599 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
600 backedge = dominated_by_p (CDI_DOMINATORS,
601 gimple_phi_arg_edge (stmt, opno)->src,
602 gimple_bb (stmt_info->stmt));
604 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
605 oprnd = TREE_OPERAND (oprnd, 0);
607 oprnd_info = (*oprnds_info)[i];
609 stmt_vec_info def_stmt_info;
610 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
612 if (dump_enabled_p ())
613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
614 "Build SLP failed: can't analyze def for %T\n",
615 oprnd);
617 return -1;
620 if (skip_args[i])
622 oprnd_info->def_stmts.quick_push (NULL);
623 oprnd_info->ops.quick_push (NULL_TREE);
624 oprnd_info->first_dt = vect_uninitialized_def;
625 continue;
628 oprnd_info->def_stmts.quick_push (def_stmt_info);
629 oprnd_info->ops.quick_push (oprnd);
631 if (def_stmt_info
632 && is_pattern_stmt_p (def_stmt_info))
634 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
635 != def_stmt_info)
636 oprnd_info->any_pattern = true;
637 else
638 /* If we promote this to external use the original stmt def. */
639 oprnd_info->ops.last ()
640 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
643 /* If there's an extern def on a backedge make sure we can
644 code-generate at the region start.
645 ??? This is another case that could be fixed by adjusting
646 how we split the function but at the moment we'd have conflicting
647 goals there. */
648 if (backedge
649 && dts[i] == vect_external_def
650 && is_a <bb_vec_info> (vinfo)
651 && TREE_CODE (oprnd) == SSA_NAME
652 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
653 && !dominated_by_p (CDI_DOMINATORS,
654 as_a <bb_vec_info> (vinfo)->bbs[0],
655 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
657 if (dump_enabled_p ())
658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
659 "Build SLP failed: extern def %T only defined "
660 "on backedge\n", oprnd);
661 return -1;
664 if (first)
666 tree type = TREE_TYPE (oprnd);
667 dt = dts[i];
668 if ((dt == vect_constant_def
669 || dt == vect_external_def)
670 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
671 && (TREE_CODE (type) == BOOLEAN_TYPE
672 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
673 type)))
675 if (dump_enabled_p ())
676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
677 "Build SLP failed: invalid type of def "
678 "for variable-length SLP %T\n", oprnd);
679 return -1;
682 /* For the swapping logic below force vect_reduction_def
683 for the reduction op in a SLP reduction group. */
684 if (!STMT_VINFO_DATA_REF (stmt_info)
685 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
686 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
687 && def_stmt_info)
688 dts[i] = dt = vect_reduction_def;
690 /* Check the types of the definition. */
691 switch (dt)
693 case vect_external_def:
694 case vect_constant_def:
695 case vect_internal_def:
696 case vect_reduction_def:
697 case vect_induction_def:
698 case vect_nested_cycle:
699 case vect_first_order_recurrence:
700 break;
702 default:
703 /* FORNOW: Not supported. */
704 if (dump_enabled_p ())
705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
706 "Build SLP failed: illegal type of def %T\n",
707 oprnd);
708 return -1;
711 oprnd_info->first_dt = dt;
712 oprnd_info->first_op_type = type;
715 if (first)
716 return 0;
718 /* Now match the operand definition types to that of the first stmt. */
719 for (i = 0; i < number_of_oprnds;)
721 if (skip_args[i])
723 ++i;
724 continue;
727 oprnd_info = (*oprnds_info)[i];
728 dt = dts[i];
729 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
730 oprnd = oprnd_info->ops[stmt_num];
731 tree type = TREE_TYPE (oprnd);
733 if (!types_compatible_p (oprnd_info->first_op_type, type))
735 if (dump_enabled_p ())
736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
737 "Build SLP failed: different operand types\n");
738 return 1;
741 /* Not first stmt of the group, check that the def-stmt/s match
742 the def-stmt/s of the first stmt. Allow different definition
743 types for reduction chains: the first stmt must be a
744 vect_reduction_def (a phi node), and the rest
745 end in the reduction chain. */
746 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
747 && !(oprnd_info->first_dt == vect_reduction_def
748 && !STMT_VINFO_DATA_REF (stmt_info)
749 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
750 && def_stmt_info
751 && !STMT_VINFO_DATA_REF (def_stmt_info)
752 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
753 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
754 || (!STMT_VINFO_DATA_REF (stmt_info)
755 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
756 && ((!def_stmt_info
757 || STMT_VINFO_DATA_REF (def_stmt_info)
758 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
759 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
760 != (oprnd_info->first_dt != vect_reduction_def))))
762 /* Try swapping operands if we got a mismatch. For BB
763 vectorization only in case it will clearly improve things. */
764 if (i == commutative_op && !swapped
765 && (!is_a <bb_vec_info> (vinfo)
766 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
767 dts[i+1])
768 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
769 || vect_def_types_match
770 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
772 if (dump_enabled_p ())
773 dump_printf_loc (MSG_NOTE, vect_location,
774 "trying swapped operands\n");
775 std::swap (dts[i], dts[i+1]);
776 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
777 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
778 std::swap ((*oprnds_info)[i]->ops[stmt_num],
779 (*oprnds_info)[i+1]->ops[stmt_num]);
780 swapped = true;
781 continue;
784 if (is_a <bb_vec_info> (vinfo)
785 && !oprnd_info->any_pattern)
787 /* Now for commutative ops we should see whether we can
788 make the other operand match. */
789 if (dump_enabled_p ())
790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
791 "treating operand as external\n");
792 oprnd_info->first_dt = dt = vect_external_def;
794 else
796 if (dump_enabled_p ())
797 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
798 "Build SLP failed: different types\n");
799 return 1;
803 /* Make sure to demote the overall operand to external. */
804 if (dt == vect_external_def)
805 oprnd_info->first_dt = vect_external_def;
806 /* For a SLP reduction chain we want to duplicate the reduction to
807 each of the chain members. That gets us a sane SLP graph (still
808 the stmts are not 100% correct wrt the initial values). */
809 else if ((dt == vect_internal_def
810 || dt == vect_reduction_def)
811 && oprnd_info->first_dt == vect_reduction_def
812 && !STMT_VINFO_DATA_REF (stmt_info)
813 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
814 && !STMT_VINFO_DATA_REF (def_stmt_info)
815 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
816 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
818 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
819 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
822 ++i;
825 /* Swap operands. */
826 if (swapped)
828 if (dump_enabled_p ())
829 dump_printf_loc (MSG_NOTE, vect_location,
830 "swapped operands to match def types in %G",
831 stmt_info->stmt);
834 return 0;
837 /* Return true if call statements CALL1 and CALL2 are similar enough
838 to be combined into the same SLP group. */
840 bool
841 compatible_calls_p (gcall *call1, gcall *call2)
843 unsigned int nargs = gimple_call_num_args (call1);
844 if (nargs != gimple_call_num_args (call2))
845 return false;
847 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
848 return false;
850 if (gimple_call_internal_p (call1))
852 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
853 TREE_TYPE (gimple_call_lhs (call2))))
854 return false;
855 for (unsigned int i = 0; i < nargs; ++i)
856 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
857 TREE_TYPE (gimple_call_arg (call2, i))))
858 return false;
860 else
862 if (!operand_equal_p (gimple_call_fn (call1),
863 gimple_call_fn (call2), 0))
864 return false;
866 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
867 return false;
870 /* Check that any unvectorized arguments are equal. */
871 if (const int *map = vect_get_operand_map (call1))
873 unsigned int nkept = *map++;
874 unsigned int mapi = 0;
875 for (unsigned int i = 0; i < nargs; ++i)
876 if (mapi < nkept && map[mapi] == int (i))
877 mapi += 1;
878 else if (!operand_equal_p (gimple_call_arg (call1, i),
879 gimple_call_arg (call2, i)))
880 return false;
883 return true;
886 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
887 caller's attempt to find the vector type in STMT_INFO with the narrowest
888 element type. Return true if VECTYPE is nonnull and if it is valid
889 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
890 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
891 vect_build_slp_tree. */
893 static bool
894 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
895 unsigned int group_size,
896 tree vectype, poly_uint64 *max_nunits)
898 if (!vectype)
900 if (dump_enabled_p ())
901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
902 "Build SLP failed: unsupported data-type in %G\n",
903 stmt_info->stmt);
904 /* Fatal mismatch. */
905 return false;
908 /* If populating the vector type requires unrolling then fail
909 before adjusting *max_nunits for basic-block vectorization. */
910 if (is_a <bb_vec_info> (vinfo)
911 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
913 if (dump_enabled_p ())
914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
915 "Build SLP failed: unrolling required "
916 "in basic block SLP\n");
917 /* Fatal mismatch. */
918 return false;
921 /* In case of multiple types we need to detect the smallest type. */
922 vect_update_max_nunits (max_nunits, vectype);
923 return true;
926 /* Verify that the scalar stmts STMTS are isomorphic, do not require a
927 data permutation and are of supported types of operation. Return
928 true if so, otherwise return false and indicate in *MATCHES
929 which stmts are not isomorphic to the first one. If MATCHES[0]
930 is false then this indicates the comparison could not be
931 carried out or the stmts will never be vectorized by SLP.
933 Note COND_EXPR is possibly isomorphic to another one after swapping its
934 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
935 the first stmt by swapping the two operands of comparison; set SWAP[i]
936 to 2 if stmt I is isomorphic to the first stmt by inverting the code
937 of comparison. Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
938 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
940 static bool
941 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
942 vec<stmt_vec_info> stmts, unsigned int group_size,
943 poly_uint64 *max_nunits, bool *matches,
944 bool *two_operators, tree *node_vectype)
946 unsigned int i;
947 stmt_vec_info first_stmt_info = stmts[0];
948 code_helper first_stmt_code = ERROR_MARK;
949 code_helper alt_stmt_code = ERROR_MARK;
950 code_helper rhs_code = ERROR_MARK;
951 code_helper first_cond_code = ERROR_MARK;
952 tree lhs;
953 bool need_same_oprnds = false;
954 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
955 stmt_vec_info first_load = NULL, prev_first_load = NULL;
956 bool first_stmt_load_p = false, load_p = false;
957 bool first_stmt_phi_p = false, phi_p = false;
958 bool maybe_soft_fail = false;
959 tree soft_fail_nunits_vectype = NULL_TREE;
961 /* For every stmt in NODE find its def stmt/s. */
962 stmt_vec_info stmt_info;
963 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
965 gimple *stmt = stmt_info->stmt;
966 swap[i] = 0;
967 matches[i] = false;
969 if (dump_enabled_p ())
970 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
972 /* Fail to vectorize statements marked as unvectorizable, throw
973 or are volatile. */
974 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
975 || stmt_can_throw_internal (cfun, stmt)
976 || gimple_has_volatile_ops (stmt))
978 if (dump_enabled_p ())
979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
980 "Build SLP failed: unvectorizable statement %G",
981 stmt);
982 /* ??? For BB vectorization we want to commutate operands in a way
983 to shuffle all unvectorizable defs into one operand and have
984 the other still vectorized. The following doesn't reliably
985 work for this, but it's the easiest we can do here. */
986 if (is_a <bb_vec_info> (vinfo) && i != 0)
987 continue;
988 /* Fatal mismatch. */
989 matches[0] = false;
990 return false;
993 lhs = gimple_get_lhs (stmt);
994 if (lhs == NULL_TREE)
996 if (dump_enabled_p ())
997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
998 "Build SLP failed: not GIMPLE_ASSIGN nor "
999 "GIMPLE_CALL %G", stmt);
1000 if (is_a <bb_vec_info> (vinfo) && i != 0)
1001 continue;
1002 /* Fatal mismatch. */
1003 matches[0] = false;
1004 return false;
1007 tree nunits_vectype;
1008 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1009 &nunits_vectype, group_size))
1011 if (is_a <bb_vec_info> (vinfo) && i != 0)
1012 continue;
1013 /* Fatal mismatch. */
1014 matches[0] = false;
1015 return false;
1017 /* Record nunits required but continue analysis, producing matches[]
1018 as if nunits was not an issue. This allows splitting of groups
1019 to happen. */
1020 if (nunits_vectype
1021 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1022 nunits_vectype, max_nunits))
1024 gcc_assert (is_a <bb_vec_info> (vinfo));
1025 maybe_soft_fail = true;
1026 soft_fail_nunits_vectype = nunits_vectype;
1029 gcc_assert (vectype);
1031 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1032 if (call_stmt)
1034 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1035 if (cfn != CFN_LAST)
1036 rhs_code = cfn;
1037 else
1038 rhs_code = CALL_EXPR;
1040 if (cfn == CFN_MASK_LOAD
1041 || cfn == CFN_GATHER_LOAD
1042 || cfn == CFN_MASK_GATHER_LOAD)
1043 load_p = true;
1044 else if ((internal_fn_p (cfn)
1045 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1046 || gimple_call_tail_p (call_stmt)
1047 || gimple_call_noreturn_p (call_stmt)
1048 || gimple_call_chain (call_stmt))
1050 if (dump_enabled_p ())
1051 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1052 "Build SLP failed: unsupported call type %G",
1053 (gimple *) call_stmt);
1054 if (is_a <bb_vec_info> (vinfo) && i != 0)
1055 continue;
1056 /* Fatal mismatch. */
1057 matches[0] = false;
1058 return false;
1061 else if (gimple_code (stmt) == GIMPLE_PHI)
1063 rhs_code = ERROR_MARK;
1064 phi_p = true;
1066 else
1068 rhs_code = gimple_assign_rhs_code (stmt);
1069 load_p = gimple_vuse (stmt);
1072 /* Check the operation. */
1073 if (i == 0)
1075 *node_vectype = vectype;
1076 first_stmt_code = rhs_code;
1077 first_stmt_load_p = load_p;
1078 first_stmt_phi_p = phi_p;
1080 /* Shift arguments should be equal in all the packed stmts for a
1081 vector shift with scalar shift operand. */
1082 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1083 || rhs_code == LROTATE_EXPR
1084 || rhs_code == RROTATE_EXPR)
1086 /* First see if we have a vector/vector shift. */
1087 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1089 /* No vector/vector shift, try for a vector/scalar shift. */
1090 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1092 if (dump_enabled_p ())
1093 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1094 "Build SLP failed: "
1095 "op not supported by target.\n");
1096 if (is_a <bb_vec_info> (vinfo) && i != 0)
1097 continue;
1098 /* Fatal mismatch. */
1099 matches[0] = false;
1100 return false;
1102 need_same_oprnds = true;
1103 first_op1 = gimple_assign_rhs2 (stmt);
1106 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1108 need_same_oprnds = true;
1109 first_op1 = gimple_assign_rhs2 (stmt);
1111 else if (!load_p
1112 && rhs_code == BIT_FIELD_REF)
1114 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1115 if (!is_a <bb_vec_info> (vinfo)
1116 || TREE_CODE (vec) != SSA_NAME
1117 /* When the element types are not compatible we pun the
1118 source to the target vectype which requires equal size. */
1119 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1120 || !types_compatible_p (TREE_TYPE (vectype),
1121 TREE_TYPE (TREE_TYPE (vec))))
1122 && !operand_equal_p (TYPE_SIZE (vectype),
1123 TYPE_SIZE (TREE_TYPE (vec)))))
1125 if (dump_enabled_p ())
1126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1127 "Build SLP failed: "
1128 "BIT_FIELD_REF not supported\n");
1129 /* Fatal mismatch. */
1130 matches[0] = false;
1131 return false;
1134 else if (rhs_code == CFN_DIV_POW2)
1136 need_same_oprnds = true;
1137 first_op1 = gimple_call_arg (call_stmt, 1);
1140 else
1142 if (first_stmt_code != rhs_code
1143 && alt_stmt_code == ERROR_MARK)
1144 alt_stmt_code = rhs_code;
1145 if ((first_stmt_code != rhs_code
1146 && (first_stmt_code != IMAGPART_EXPR
1147 || rhs_code != REALPART_EXPR)
1148 && (first_stmt_code != REALPART_EXPR
1149 || rhs_code != IMAGPART_EXPR)
1150 /* Handle mismatches in plus/minus by computing both
1151 and merging the results. */
1152 && !((first_stmt_code == PLUS_EXPR
1153 || first_stmt_code == MINUS_EXPR)
1154 && (alt_stmt_code == PLUS_EXPR
1155 || alt_stmt_code == MINUS_EXPR)
1156 && rhs_code == alt_stmt_code)
1157 && !(first_stmt_code.is_tree_code ()
1158 && rhs_code.is_tree_code ()
1159 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1160 == tcc_comparison)
1161 && (swap_tree_comparison (tree_code (first_stmt_code))
1162 == tree_code (rhs_code)))
1163 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1164 && (first_stmt_code == ARRAY_REF
1165 || first_stmt_code == BIT_FIELD_REF
1166 || first_stmt_code == INDIRECT_REF
1167 || first_stmt_code == COMPONENT_REF
1168 || first_stmt_code == MEM_REF)
1169 && (rhs_code == ARRAY_REF
1170 || rhs_code == BIT_FIELD_REF
1171 || rhs_code == INDIRECT_REF
1172 || rhs_code == COMPONENT_REF
1173 || rhs_code == MEM_REF)))
1174 || first_stmt_load_p != load_p
1175 || first_stmt_phi_p != phi_p)
1177 if (dump_enabled_p ())
1179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1180 "Build SLP failed: different operation "
1181 "in stmt %G", stmt);
1182 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1183 "original stmt %G", first_stmt_info->stmt);
1185 /* Mismatch. */
1186 continue;
1189 if (!load_p
1190 && first_stmt_code == BIT_FIELD_REF
1191 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1192 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1194 if (dump_enabled_p ())
1195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1196 "Build SLP failed: different BIT_FIELD_REF "
1197 "arguments in %G", stmt);
1198 /* Mismatch. */
1199 continue;
1202 if (call_stmt && first_stmt_code != CFN_MASK_LOAD)
1204 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1205 call_stmt))
1207 if (dump_enabled_p ())
1208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1209 "Build SLP failed: different calls in %G",
1210 stmt);
1211 /* Mismatch. */
1212 continue;
1216 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1217 && (gimple_bb (first_stmt_info->stmt)
1218 != gimple_bb (stmt_info->stmt)))
1220 if (dump_enabled_p ())
1221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1222 "Build SLP failed: different BB for PHI "
1223 "or possibly trapping operation in %G", stmt);
1224 /* Mismatch. */
1225 continue;
1228 if (need_same_oprnds)
1230 tree other_op1 = gimple_arg (stmt, 1);
1231 if (!operand_equal_p (first_op1, other_op1, 0))
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "Build SLP failed: different shift "
1236 "arguments in %G", stmt);
1237 /* Mismatch. */
1238 continue;
1242 if (!types_compatible_p (vectype, *node_vectype))
1244 if (dump_enabled_p ())
1245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1246 "Build SLP failed: different vector type "
1247 "in %G", stmt);
1248 /* Mismatch. */
1249 continue;
1253 /* Grouped store or load. */
1254 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1256 if (REFERENCE_CLASS_P (lhs))
1258 /* Store. */
1261 else
1263 /* Load. */
1264 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1265 if (prev_first_load)
1267 /* Check that there are no loads from different interleaving
1268 chains in the same node. */
1269 if (prev_first_load != first_load)
1271 if (dump_enabled_p ())
1272 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1273 vect_location,
1274 "Build SLP failed: different "
1275 "interleaving chains in one node %G",
1276 stmt);
1277 /* Mismatch. */
1278 continue;
1281 else
1282 prev_first_load = first_load;
1284 } /* Grouped access. */
1285 else
1287 if (load_p
1288 && rhs_code != CFN_GATHER_LOAD
1289 && rhs_code != CFN_MASK_GATHER_LOAD)
1291 /* Not grouped load. */
1292 if (dump_enabled_p ())
1293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1294 "Build SLP failed: not grouped load %G", stmt);
1296 /* FORNOW: Non-grouped loads are not supported. */
1297 if (is_a <bb_vec_info> (vinfo) && i != 0)
1298 continue;
1299 /* Fatal mismatch. */
1300 matches[0] = false;
1301 return false;
1304 /* Not memory operation. */
1305 if (!phi_p
1306 && rhs_code.is_tree_code ()
1307 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1308 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1309 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1310 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1311 && rhs_code != VIEW_CONVERT_EXPR
1312 && rhs_code != CALL_EXPR
1313 && rhs_code != BIT_FIELD_REF)
1315 if (dump_enabled_p ())
1316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1317 "Build SLP failed: operation unsupported %G",
1318 stmt);
1319 if (is_a <bb_vec_info> (vinfo) && i != 0)
1320 continue;
1321 /* Fatal mismatch. */
1322 matches[0] = false;
1323 return false;
1326 if (rhs_code == COND_EXPR)
1328 tree cond_expr = gimple_assign_rhs1 (stmt);
1329 enum tree_code cond_code = TREE_CODE (cond_expr);
1330 enum tree_code swap_code = ERROR_MARK;
1331 enum tree_code invert_code = ERROR_MARK;
1333 if (i == 0)
1334 first_cond_code = TREE_CODE (cond_expr);
1335 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1337 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1338 swap_code = swap_tree_comparison (cond_code);
1339 invert_code = invert_tree_comparison (cond_code, honor_nans);
1342 if (first_cond_code == cond_code)
1344 /* Isomorphism can be achieved by swapping. */
1345 else if (first_cond_code == swap_code)
1346 swap[i] = 1;
1347 /* Isomorphism can be achieved by inverting. */
1348 else if (first_cond_code == invert_code)
1349 swap[i] = 2;
1350 else
1352 if (dump_enabled_p ())
1353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1354 "Build SLP failed: different"
1355 " operation %G", stmt);
1356 /* Mismatch. */
1357 continue;
1361 if (rhs_code.is_tree_code ()
1362 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1363 && (swap_tree_comparison ((tree_code)first_stmt_code)
1364 == (tree_code)rhs_code))
1365 swap[i] = 1;
1368 matches[i] = true;
1371 for (i = 0; i < group_size; ++i)
1372 if (!matches[i])
1373 return false;
1375 /* If we allowed a two-operation SLP node verify the target can cope
1376 with the permute we are going to use. */
1377 if (alt_stmt_code != ERROR_MARK
1378 && (!alt_stmt_code.is_tree_code ()
1379 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1380 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1382 *two_operators = true;
1385 if (maybe_soft_fail)
1387 unsigned HOST_WIDE_INT const_nunits;
1388 if (!TYPE_VECTOR_SUBPARTS
1389 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1390 || const_nunits > group_size)
1391 matches[0] = false;
1392 else
1394 /* With constant vector elements simulate a mismatch at the
1395 point we need to split. */
1396 unsigned tail = group_size & (const_nunits - 1);
1397 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1399 return false;
1402 return true;
1405 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1406 Note we never remove entries except at destruction time, so we do
1407 not need a deleted value that differs from empty. */
1408 struct bst_traits
1410 typedef vec <stmt_vec_info> value_type;
1411 typedef vec <stmt_vec_info> compare_type;
1412 static inline hashval_t hash (value_type);
1413 static inline bool equal (value_type existing, value_type candidate);
1414 static inline bool is_empty (value_type x) { return !x.exists (); }
1415 static inline bool is_deleted (value_type x) { return !x.exists (); }
1416 static const bool empty_zero_p = true;
1417 static inline void mark_empty (value_type &x) { x.release (); }
1418 static inline void mark_deleted (value_type &x) { x.release (); }
1419 static inline void remove (value_type &x) { x.release (); }
1421 inline hashval_t
1422 bst_traits::hash (value_type x)
1424 inchash::hash h;
1425 for (unsigned i = 0; i < x.length (); ++i)
1426 h.add_int (gimple_uid (x[i]->stmt));
1427 return h.end ();
1429 inline bool
1430 bst_traits::equal (value_type existing, value_type candidate)
1432 if (existing.length () != candidate.length ())
1433 return false;
1434 for (unsigned i = 0; i < existing.length (); ++i)
1435 if (existing[i] != candidate[i])
1436 return false;
1437 return true;
1440 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1441 but then vec::insert does memmove and that's not compatible with
1442 std::pair. */
1443 struct chain_op_t
1445 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1446 : code (code_), dt (dt_), op (op_) {}
1447 tree_code code;
1448 vect_def_type dt;
1449 tree op;
1452 /* Comparator for sorting associatable chains. */
1454 static int
1455 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1457 auto *op1 = (const chain_op_t *) op1_;
1458 auto *op2 = (const chain_op_t *) op2_;
1459 if (op1->dt != op2->dt)
1460 return (int)op1->dt - (int)op2->dt;
1461 return (int)op1->code - (int)op2->code;
1464 /* Linearize the associatable expression chain at START with the
1465 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1466 filling CHAIN with the result and using WORKLIST as intermediate storage.
1467 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1468 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1469 stmts, starting with START. */
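/* Illustrative note (not part of the original sources): for a lane such as
     x = (a - b) + c
   with CODE == PLUS_EXPR, and assuming the intermediate (a - b) has a
   single use so the walk below recurses into it, the resulting chain
   records (in some order) each leaf operand with its effective sign after
   folding away the intermediate plus/minus statements:
     a with PLUS_EXPR, b with MINUS_EXPR, c with PLUS_EXPR.  */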
1471 static void
1472 vect_slp_linearize_chain (vec_info *vinfo,
1473 vec<std::pair<tree_code, gimple *> > &worklist,
1474 vec<chain_op_t> &chain,
1475 enum tree_code code, gimple *start,
1476 gimple *&code_stmt, gimple *&alt_code_stmt,
1477 vec<gimple *> *chain_stmts)
1479 /* For each lane linearize the addition/subtraction (or other
1480 uniform associatable operation) expression tree. */
1481 worklist.safe_push (std::make_pair (code, start));
1482 while (!worklist.is_empty ())
1484 auto entry = worklist.pop ();
1485 gassign *stmt = as_a <gassign *> (entry.second);
1486 enum tree_code in_code = entry.first;
1487 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1488 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1489 if (!code_stmt
1490 && gimple_assign_rhs_code (stmt) == code)
1491 code_stmt = stmt;
1492 else if (!alt_code_stmt
1493 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1494 alt_code_stmt = stmt;
1495 if (chain_stmts)
1496 chain_stmts->safe_push (stmt);
1497 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1499 tree op = gimple_op (stmt, opnum);
1500 vect_def_type dt;
1501 stmt_vec_info def_stmt_info;
1502 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1503 gcc_assert (res);
1504 if (dt == vect_internal_def
1505 && is_pattern_stmt_p (def_stmt_info))
1506 op = gimple_get_lhs (def_stmt_info->stmt);
1507 gimple *use_stmt;
1508 use_operand_p use_p;
1509 if (dt == vect_internal_def
1510 && single_imm_use (op, &use_p, &use_stmt)
1511 && is_gimple_assign (def_stmt_info->stmt)
1512 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1513 || (code == PLUS_EXPR
1514 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1515 == MINUS_EXPR))))
1517 tree_code op_def_code = this_code;
1518 if (op_def_code == MINUS_EXPR && opnum == 1)
1519 op_def_code = PLUS_EXPR;
1520 if (in_code == MINUS_EXPR)
1521 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1522 worklist.safe_push (std::make_pair (op_def_code,
1523 def_stmt_info->stmt));
1525 else
1527 tree_code op_def_code = this_code;
1528 if (op_def_code == MINUS_EXPR && opnum == 1)
1529 op_def_code = PLUS_EXPR;
1530 if (in_code == MINUS_EXPR)
1531 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1532 chain.safe_push (chain_op_t (op_def_code, dt, op));
1538 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1539 simple_hashmap_traits <bst_traits, slp_tree> >
1540 scalar_stmts_to_slp_tree_map_t;
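/* Illustrative note (not part of the original sources): this map memoizes
   SLP discovery keyed on the vector of scalar stmts.  vect_build_slp_tree
   below consults it first: a previously built node is re-used with its
   reference count bumped, while a previously failed build replays the
   recorded matches[] array (stored in node->failed) instead of redoing
   the discovery work.  */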
1542 static slp_tree
1543 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1544 vec<stmt_vec_info> stmts, unsigned int group_size,
1545 poly_uint64 *max_nunits,
1546 bool *matches, unsigned *limit, unsigned *tree_size,
1547 scalar_stmts_to_slp_tree_map_t *bst_map);
1549 static slp_tree
1550 vect_build_slp_tree (vec_info *vinfo,
1551 vec<stmt_vec_info> stmts, unsigned int group_size,
1552 poly_uint64 *max_nunits,
1553 bool *matches, unsigned *limit, unsigned *tree_size,
1554 scalar_stmts_to_slp_tree_map_t *bst_map)
1556 if (slp_tree *leader = bst_map->get (stmts))
1558 if (dump_enabled_p ())
1559 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1560 !(*leader)->failed ? "" : "failed ",
1561 (void *) *leader);
1562 if (!(*leader)->failed)
1564 SLP_TREE_REF_COUNT (*leader)++;
1565 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1566 stmts.release ();
1567 return *leader;
1569 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1570 return NULL;
1573 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1574 so we can pick up backedge destinations during discovery. */
1575 slp_tree res = new _slp_tree;
1576 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1577 SLP_TREE_SCALAR_STMTS (res) = stmts;
1578 bst_map->put (stmts.copy (), res);
1580 if (*limit == 0)
1582 if (dump_enabled_p ())
1583 dump_printf_loc (MSG_NOTE, vect_location,
1584 "SLP discovery limit exceeded\n");
1585 /* Mark the node invalid so we can detect those when still in use
1586 as backedge destinations. */
1587 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1588 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1589 res->failed = XNEWVEC (bool, group_size);
1590 memset (res->failed, 0, sizeof (bool) * group_size);
1591 memset (matches, 0, sizeof (bool) * group_size);
1592 return NULL;
1594 --*limit;
1596 if (dump_enabled_p ())
1597 dump_printf_loc (MSG_NOTE, vect_location,
1598 "starting SLP discovery for node %p\n", (void *) res);
1600 poly_uint64 this_max_nunits = 1;
1601 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1602 &this_max_nunits,
1603 matches, limit, tree_size, bst_map);
1604 if (!res_)
1606 if (dump_enabled_p ())
1607 dump_printf_loc (MSG_NOTE, vect_location,
1608 "SLP discovery for node %p failed\n", (void *) res);
1609 /* Mark the node invalid so we can detect those when still in use
1610 as backedge destinations. */
1611 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1612 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1613 res->failed = XNEWVEC (bool, group_size);
1614 if (flag_checking)
1616 unsigned i;
1617 for (i = 0; i < group_size; ++i)
1618 if (!matches[i])
1619 break;
1620 gcc_assert (i < group_size);
1622 memcpy (res->failed, matches, sizeof (bool) * group_size);
1624 else
1626 if (dump_enabled_p ())
1627 dump_printf_loc (MSG_NOTE, vect_location,
1628 "SLP discovery for node %p succeeded\n",
1629 (void *) res);
1630 gcc_assert (res_ == res);
1631 res->max_nunits = this_max_nunits;
1632 vect_update_max_nunits (max_nunits, this_max_nunits);
1633 /* Keep a reference for the bst_map use. */
1634 SLP_TREE_REF_COUNT (res)++;
1636 return res_;
1639 /* Helper for building an associated SLP node chain. */
1641 static void
1642 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1643 slp_tree op0, slp_tree op1,
1644 stmt_vec_info oper1, stmt_vec_info oper2,
1645 vec<std::pair<unsigned, unsigned> > lperm)
1647 unsigned group_size = SLP_TREE_LANES (op1);
1649 slp_tree child1 = new _slp_tree;
1650 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1651 SLP_TREE_VECTYPE (child1) = vectype;
1652 SLP_TREE_LANES (child1) = group_size;
1653 SLP_TREE_CHILDREN (child1).create (2);
1654 SLP_TREE_CHILDREN (child1).quick_push (op0);
1655 SLP_TREE_CHILDREN (child1).quick_push (op1);
1656 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1658 slp_tree child2 = new _slp_tree;
1659 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1660 SLP_TREE_VECTYPE (child2) = vectype;
1661 SLP_TREE_LANES (child2) = group_size;
1662 SLP_TREE_CHILDREN (child2).create (2);
1663 SLP_TREE_CHILDREN (child2).quick_push (op0);
1664 SLP_TREE_REF_COUNT (op0)++;
1665 SLP_TREE_CHILDREN (child2).quick_push (op1);
1666 SLP_TREE_REF_COUNT (op1)++;
1667 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1669 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1670 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1671 SLP_TREE_VECTYPE (perm) = vectype;
1672 SLP_TREE_LANES (perm) = group_size;
1673 /* ??? We should set this NULL but that's not expected. */
1674 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1675 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1676 SLP_TREE_CHILDREN (perm).quick_push (child1);
1677 SLP_TREE_CHILDREN (perm).quick_push (child2);
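/* Illustrative note (not part of the original sources): the helper above
   implements the "two operators" scheme.  For a group mixing operations,
   e.g. { a0 + b0, a1 - b1 }, it builds one child computing every lane with
   the first operation and one with the second, plus a VEC_PERM_EXPR node
   whose lane permutation selects, for each lane, the result from the child
   that holds the operation that lane actually uses.  */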
1680 /* Recursively build an SLP tree starting from NODE.
1681 Fail (and return NULL) if def-stmts are not isomorphic, require data
1682 permutation or are of unsupported types of operation, recording in
1683 MATCHES which lanes of the group did match. Otherwise return the
1684 built SLP node. */
1687 static slp_tree
1688 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1689 vec<stmt_vec_info> stmts, unsigned int group_size,
1690 poly_uint64 *max_nunits,
1691 bool *matches, unsigned *limit, unsigned *tree_size,
1692 scalar_stmts_to_slp_tree_map_t *bst_map)
1694 unsigned nops, i, this_tree_size = 0;
1695 poly_uint64 this_max_nunits = *max_nunits;
1697 matches[0] = false;
1699 stmt_vec_info stmt_info = stmts[0];
1700 if (!is_a<gcall *> (stmt_info->stmt)
1701 && !is_a<gassign *> (stmt_info->stmt)
1702 && !is_a<gphi *> (stmt_info->stmt))
1703 return NULL;
1705 nops = gimple_num_args (stmt_info->stmt);
1706 if (const int *map = vect_get_operand_map (stmt_info->stmt))
1707 nops = map[0];
1709 /* If the SLP node is a PHI (induction or reduction), terminate
1710 the recursion. */
1711 bool *skip_args = XALLOCAVEC (bool, nops);
1712 memset (skip_args, 0, sizeof (bool) * nops);
1713 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1714 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1716 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1717 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1718 group_size);
1719 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1720 max_nunits))
1721 return NULL;
1723 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1724 if (def_type == vect_induction_def)
1726 /* Induction PHIs are not cycles but walk the initial
1727 value. Only for inner loops though, for outer loops
1728 we need to pick up the value from the actual PHIs
1729 to more easily support peeling and epilogue vectorization. */
1730 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1731 if (!nested_in_vect_loop_p (loop, stmt_info))
1732 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1733 else
1734 loop = loop->inner;
1735 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1737 else if (def_type == vect_reduction_def
1738 || def_type == vect_double_reduction_def
1739 || def_type == vect_nested_cycle
1740 || def_type == vect_first_order_recurrence)
1742 /* Else def types have to match. */
1743 stmt_vec_info other_info;
1744 bool all_same = true;
1745 FOR_EACH_VEC_ELT (stmts, i, other_info)
1747 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1748 return NULL;
1749 if (other_info != stmt_info)
1750 all_same = false;
1752 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1753 /* Reduction initial values are not explicitly represented. */
1754 if (def_type != vect_first_order_recurrence
1755 && !nested_in_vect_loop_p (loop, stmt_info))
1756 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1757 /* Reduction chain backedge defs are filled manually.
1758 ??? Need a better way to identify a SLP reduction chain PHI.
1759 Or a better overall way to SLP match those. */
1760 if (all_same && def_type == vect_reduction_def)
1761 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1763 else if (def_type != vect_internal_def)
1764 return NULL;
1768 bool two_operators = false;
1769 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1770 tree vectype = NULL_TREE;
1771 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1772 &this_max_nunits, matches, &two_operators,
1773 &vectype))
1774 return NULL;
1776 /* If the SLP node is a load, terminate the recursion unless masked. */
1777 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1778 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1780 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1781 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1782 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1783 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1784 else
1786 *max_nunits = this_max_nunits;
1787 (*tree_size)++;
1788 node = vect_create_new_slp_node (node, stmts, 0);
1789 SLP_TREE_VECTYPE (node) = vectype;
1790 /* And compute the load permutation. Whether it is actually
1791 a permutation depends on the unrolling factor which is
1792 decided later. */
1793 vec<unsigned> load_permutation;
1794 int j;
1795 stmt_vec_info load_info;
1796 load_permutation.create (group_size);
1797 stmt_vec_info first_stmt_info
1798 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1799 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1801 int load_place = vect_get_place_in_interleaving_chain
1802 (load_info, first_stmt_info);
1803 gcc_assert (load_place != -1);
1804 load_permutation.safe_push (load_place);
1806 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1807 return node;
1810 else if (gimple_assign_single_p (stmt_info->stmt)
1811 && !gimple_vuse (stmt_info->stmt)
1812 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1814 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1815 the same SSA name vector of a type compatible with vectype. */
1816 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1817 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1818 stmt_vec_info estmt_info;
1819 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1821 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1822 tree bfref = gimple_assign_rhs1 (estmt);
1823 HOST_WIDE_INT lane;
1824 if (!known_eq (bit_field_size (bfref),
1825 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1826 || !constant_multiple_p (bit_field_offset (bfref),
1827 bit_field_size (bfref), &lane))
1829 lperm.release ();
1830 matches[0] = false;
1831 return NULL;
1833 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1835 slp_tree vnode = vect_create_new_slp_node (vNULL);
1836 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1837 /* ??? We record vectype here but hide any punning that may
1838 eventually be necessary, instead relying on code generation to
1839 materialize VIEW_CONVERT_EXPRs as needed. We should make
1840 this explicit somehow. */
1841 SLP_TREE_VECTYPE (vnode) = vectype;
1842 else
1844 /* For different size but compatible elements we can still
1845 use VEC_PERM_EXPR without punning. */
1846 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
1847 && types_compatible_p (TREE_TYPE (vectype),
1848 TREE_TYPE (TREE_TYPE (vec))));
1849 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1851 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
1852 unsigned HOST_WIDE_INT const_nunits;
1853 if (nunits.is_constant (&const_nunits))
1854 SLP_TREE_LANES (vnode) = const_nunits;
1855 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1856 /* We are always building a permutation node even if it is an identity
1857 permute to shield the rest of the vectorizer from the odd node
1858 representing an actual vector without any scalar ops.
1859 ??? We could hide it completely by making the permute node
1860 external? */
1861 node = vect_create_new_slp_node (node, stmts, 1);
1862 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1863 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1864 SLP_TREE_VECTYPE (node) = vectype;
1865 SLP_TREE_CHILDREN (node).quick_push (vnode);
1866 return node;
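/* Editorial note (not in the original source): a sketch of the lane
   computation above for a hypothetical 4 x 32-bit vector V.  The stmts
   _1 = BIT_FIELD_REF <V, 32, 64> and _2 = BIT_FIELD_REF <V, 32, 96>
   extract lanes 64/32 = 2 and 96/32 = 3, so lperm becomes
   { (0,2), (0,3) } and the node is emitted as a VEC_PERM_EXPR selecting
   those lanes from the single vector child.  */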
1868 /* When discovery reaches an associatable operation see whether we can
1869 improve that to match up lanes in a way superior to the operand
1870 swapping code which at most looks at two defs.
1871 ??? For BB vectorization we cannot do the brute-force search
1872 for matching as we can succeed by means of builds from scalars
1873 and have no good way to "cost" one build against another. */
1874 else if (is_a <loop_vec_info> (vinfo)
1875 /* ??? We don't handle !vect_internal_def defs below. */
1876 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1877 && is_gimple_assign (stmt_info->stmt)
1878 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1879 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1880 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1881 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1882 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1884 /* See if we have a chain of (mixed) adds or subtracts or other
1885 associatable ops. */
1886 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1887 if (code == MINUS_EXPR)
1888 code = PLUS_EXPR;
1889 stmt_vec_info other_op_stmt_info = NULL;
1890 stmt_vec_info op_stmt_info = NULL;
1891 unsigned chain_len = 0;
1892 auto_vec<chain_op_t> chain;
1893 auto_vec<std::pair<tree_code, gimple *> > worklist;
1894 auto_vec<vec<chain_op_t> > chains (group_size);
1895 auto_vec<slp_tree, 4> children;
1896 bool hard_fail = true;
1897 for (unsigned lane = 0; lane < group_size; ++lane)
1899 /* For each lane linearize the addition/subtraction (or other
1900 uniform associatable operation) expression tree. */
1901 gimple *op_stmt = NULL, *other_op_stmt = NULL;
1902 vect_slp_linearize_chain (vinfo, worklist, chain, code,
1903 stmts[lane]->stmt, op_stmt, other_op_stmt,
1904 NULL);
1905 if (!op_stmt_info && op_stmt)
1906 op_stmt_info = vinfo->lookup_stmt (op_stmt);
1907 if (!other_op_stmt_info && other_op_stmt)
1908 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1909 if (chain.length () == 2)
1911 /* In a chain of just two elements resort to the regular
1912 operand swapping scheme. If we run into a length
1913 mismatch still hard-FAIL. */
1914 if (chain_len == 0)
1915 hard_fail = false;
1916 else
1918 matches[lane] = false;
1919 /* ??? We might want to process the other lanes, but
1920 make sure to not give false matching hints to the
1921 caller for lanes we did not process. */
1922 if (lane != group_size - 1)
1923 matches[0] = false;
1925 break;
1927 else if (chain_len == 0)
1928 chain_len = chain.length ();
1929 else if (chain.length () != chain_len)
1931 /* ??? Here we could slip in magic to compensate with
1932 neutral operands. */
1933 matches[lane] = false;
1934 if (lane != group_size - 1)
1935 matches[0] = false;
1936 break;
1938 chains.quick_push (chain.copy ());
1939 chain.truncate (0);
1941 if (chains.length () == group_size)
1943 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
1944 if (!op_stmt_info)
1946 hard_fail = false;
1947 goto out;
1949 /* Now we have a set of chains with the same length. */
1950 /* 1. pre-sort according to def_type and operation. */
1951 for (unsigned lane = 0; lane < group_size; ++lane)
1952 chains[lane].stablesort (dt_sort_cmp, vinfo);
1953 if (dump_enabled_p ())
1955 dump_printf_loc (MSG_NOTE, vect_location,
1956 "pre-sorted chains of %s\n",
1957 get_tree_code_name (code));
1958 for (unsigned lane = 0; lane < group_size; ++lane)
1960 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
1961 dump_printf (MSG_NOTE, "%s %T ",
1962 get_tree_code_name (chains[lane][opnum].code),
1963 chains[lane][opnum].op);
1964 dump_printf (MSG_NOTE, "\n");
1967 /* 2. try to build children nodes, associating as necessary. */
1968 for (unsigned n = 0; n < chain_len; ++n)
1970 vect_def_type dt = chains[0][n].dt;
1971 unsigned lane;
1972 for (lane = 0; lane < group_size; ++lane)
1973 if (chains[lane][n].dt != dt)
1975 if (dt == vect_constant_def
1976 && chains[lane][n].dt == vect_external_def)
1977 dt = vect_external_def;
1978 else if (dt == vect_external_def
1979 && chains[lane][n].dt == vect_constant_def)
1981 else
1982 break;
1984 if (lane != group_size)
1986 if (dump_enabled_p ())
1987 dump_printf_loc (MSG_NOTE, vect_location,
1988 "giving up on chain due to mismatched "
1989 "def types\n");
1990 matches[lane] = false;
1991 if (lane != group_size - 1)
1992 matches[0] = false;
1993 goto out;
1995 if (dt == vect_constant_def
1996 || dt == vect_external_def)
1998 /* Check whether we can build the invariant. If we can't
1999 we never will be able to. */
2000 tree type = TREE_TYPE (chains[0][n].op);
2001 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2002 && (TREE_CODE (type) == BOOLEAN_TYPE
2003 || !can_duplicate_and_interleave_p (vinfo, group_size,
2004 type)))
2006 matches[0] = false;
2007 goto out;
2009 vec<tree> ops;
2010 ops.create (group_size);
2011 for (lane = 0; lane < group_size; ++lane)
2012 ops.quick_push (chains[lane][n].op);
2013 slp_tree child = vect_create_new_slp_node (ops);
2014 SLP_TREE_DEF_TYPE (child) = dt;
2015 children.safe_push (child);
2017 else if (dt != vect_internal_def)
2019 /* Not sure, we might need sth special.
2020 gcc.dg/vect/pr96854.c,
2021 gfortran.dg/vect/fast-math-pr37021.f90
2022 and gfortran.dg/vect/pr61171.f trigger. */
2023 /* Soft-fail for now. */
2024 hard_fail = false;
2025 goto out;
2027 else
2029 vec<stmt_vec_info> op_stmts;
2030 op_stmts.create (group_size);
2031 slp_tree child = NULL;
2032 /* Brute-force our way. We have to consider a lane
2033 failing after fixing an earlier fail up in the
2034 SLP discovery recursion. So track the current
2035 permute per lane. */
2036 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2037 memset (perms, 0, sizeof (unsigned) * group_size);
2040 op_stmts.truncate (0);
2041 for (lane = 0; lane < group_size; ++lane)
2042 op_stmts.quick_push
2043 (vinfo->lookup_def (chains[lane][n].op));
2044 child = vect_build_slp_tree (vinfo, op_stmts,
2045 group_size, &this_max_nunits,
2046 matches, limit,
2047 &this_tree_size, bst_map);
2048 /* ??? We're likely getting too many fatal mismatches
2049 here so maybe we want to ignore them (but then we
2050 have no idea which lanes fatally mismatched). */
2051 if (child || !matches[0])
2052 break;
2053 /* Swap another lane we have not yet matched up into
2054 lanes that did not match. If we run out of
2055 permute possibilities for a lane terminate the
2056 search. */
2057 bool term = false;
2058 for (lane = 1; lane < group_size; ++lane)
2059 if (!matches[lane])
2061 if (n + perms[lane] + 1 == chain_len)
2063 term = true;
2064 break;
2066 std::swap (chains[lane][n],
2067 chains[lane][n + perms[lane] + 1]);
2068 perms[lane]++;
2070 if (term)
2071 break;
2073 while (1);
2074 if (!child)
2076 if (dump_enabled_p ())
2077 dump_printf_loc (MSG_NOTE, vect_location,
2078 "failed to match up op %d\n", n);
2079 op_stmts.release ();
2080 if (lane != group_size - 1)
2081 matches[0] = false;
2082 else
2083 matches[lane] = false;
2084 goto out;
2086 if (dump_enabled_p ())
2088 dump_printf_loc (MSG_NOTE, vect_location,
2089 "matched up op %d to\n", n);
2090 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2092 children.safe_push (child);
2095 /* 3. build SLP nodes to combine the chain. */
2096 for (unsigned lane = 0; lane < group_size; ++lane)
2097 if (chains[lane][0].code != code)
2099 /* See if there's any alternate all-PLUS entry. */
2100 unsigned n;
2101 for (n = 1; n < chain_len; ++n)
2103 for (lane = 0; lane < group_size; ++lane)
2104 if (chains[lane][n].code != code)
2105 break;
2106 if (lane == group_size)
2107 break;
2109 if (n != chain_len)
2111 /* Swap that in at first position. */
2112 std::swap (children[0], children[n]);
2113 for (lane = 0; lane < group_size; ++lane)
2114 std::swap (chains[lane][0], chains[lane][n]);
2116 else
2118 /* ??? When this triggers and we end up with two
2119 vect_constant/external_def up-front things break (ICE)
2120 spectacularly finding an insertion place for the
2121 all-constant op. We should have a fully
2122 vect_internal_def operand though(?) so we can swap
2123 that into first place and then prepend the all-zero
2124 constant. */
2125 if (dump_enabled_p ())
2126 dump_printf_loc (MSG_NOTE, vect_location,
2127 "inserting constant zero to compensate "
2128 "for (partially) negated first "
2129 "operand\n");
2130 chain_len++;
2131 for (lane = 0; lane < group_size; ++lane)
2132 chains[lane].safe_insert
2133 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2134 vec<tree> zero_ops;
2135 zero_ops.create (group_size);
2136 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2137 for (lane = 1; lane < group_size; ++lane)
2138 zero_ops.quick_push (zero_ops[0]);
2139 slp_tree zero = vect_create_new_slp_node (zero_ops);
2140 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2141 children.safe_insert (0, zero);
2143 break;
2145 for (unsigned i = 1; i < children.length (); ++i)
2147 slp_tree op0 = children[i - 1];
2148 slp_tree op1 = children[i];
2149 bool this_two_op = false;
2150 for (unsigned lane = 0; lane < group_size; ++lane)
2151 if (chains[lane][i].code != chains[0][i].code)
2153 this_two_op = true;
2154 break;
2156 slp_tree child;
2157 if (i == children.length () - 1)
2158 child = vect_create_new_slp_node (node, stmts, 2);
2159 else
2160 child = vect_create_new_slp_node (2, ERROR_MARK);
2161 if (this_two_op)
2163 vec<std::pair<unsigned, unsigned> > lperm;
2164 lperm.create (group_size);
2165 for (unsigned lane = 0; lane < group_size; ++lane)
2166 lperm.quick_push (std::make_pair
2167 (chains[lane][i].code != chains[0][i].code, lane));
2168 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2169 (chains[0][i].code == code
2170 ? op_stmt_info
2171 : other_op_stmt_info),
2172 (chains[0][i].code == code
2173 ? other_op_stmt_info
2174 : op_stmt_info),
2175 lperm);
2177 else
2179 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2180 SLP_TREE_VECTYPE (child) = vectype;
2181 SLP_TREE_LANES (child) = group_size;
2182 SLP_TREE_CHILDREN (child).quick_push (op0);
2183 SLP_TREE_CHILDREN (child).quick_push (op1);
2184 SLP_TREE_REPRESENTATIVE (child)
2185 = (chains[0][i].code == code
2186 ? op_stmt_info : other_op_stmt_info);
2188 children[i] = child;
2190 *tree_size += this_tree_size + 1;
2191 *max_nunits = this_max_nunits;
2192 while (!chains.is_empty ())
2193 chains.pop ().release ();
2194 return node;
2196 out:
2197 while (!children.is_empty ())
2198 vect_free_slp_tree (children.pop ());
2199 while (!chains.is_empty ())
2200 chains.pop ().release ();
2201 /* Hard-fail, otherwise we might run into quadratic processing of the
2202 chains starting one stmt into the chain again. */
2203 if (hard_fail)
2204 return NULL;
2205 /* Fall thru to normal processing. */
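/* Editorial note (not in the original source): the chain matching above
   can be illustrated with a hypothetical two-lane group
     x0 + y0 + z0
     y1 + x1 + z1
   Each lane is linearized into a chain of three PLUS operands.  When
   building the SLP child for operand 0, the defs of x0 and y1 may fail to
   match (say they come from different operations); matches[1] then goes
   false and the brute-force loop swaps chains[1][0] with chains[1][1],
   retrying with { x0, x1 }.  If discovery succeeds for all operands the
   children { x0, x1 }, { y0, y1 }, { z0, z1 } are recombined with new
   PLUS nodes, independently of the original source order per lane.  */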
2208 /* Get at the operands, verifying they are compatible. */
2209 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2210 slp_oprnd_info oprnd_info;
2211 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2213 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2214 stmts, i, &oprnds_info);
2215 if (res != 0)
2216 matches[(res == -1) ? 0 : i] = false;
2217 if (!matches[0])
2218 break;
2220 for (i = 0; i < group_size; ++i)
2221 if (!matches[i])
2223 vect_free_oprnd_info (oprnds_info);
2224 return NULL;
2226 swap = NULL;
2228 auto_vec<slp_tree, 4> children;
2230 stmt_info = stmts[0];
2232 /* Create SLP_TREE nodes for the definition node/s. */
2233 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2235 slp_tree child;
2236 unsigned int j;
2238 /* We're skipping certain operands from processing, for example
2239 outer loop reduction initial defs. */
2240 if (skip_args[i])
2242 children.safe_push (NULL);
2243 continue;
2246 if (oprnd_info->first_dt == vect_uninitialized_def)
2248 /* COND_EXPRs eventually have one operand too many if the condition
2249 is an SSA name. */
2250 gcc_assert (i == 3 && nops == 4);
2251 continue;
2254 if (is_a <bb_vec_info> (vinfo)
2255 && oprnd_info->first_dt == vect_internal_def
2256 && !oprnd_info->any_pattern)
2258 /* For BB vectorization, if all defs are the same do not
2259 bother to continue the build along the single-lane
2260 graph but use a splat of the scalar value. */
2261 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2262 for (j = 1; j < group_size; ++j)
2263 if (oprnd_info->def_stmts[j] != first_def)
2264 break;
2265 if (j == group_size
2266 /* But avoid doing this for loads where we may be
2267 able to CSE things, unless the stmt is not
2268 vectorizable. */
2269 && (!STMT_VINFO_VECTORIZABLE (first_def)
2270 || !gimple_vuse (first_def->stmt)))
2272 if (dump_enabled_p ())
2273 dump_printf_loc (MSG_NOTE, vect_location,
2274 "Using a splat of the uniform operand %G",
2275 first_def->stmt);
2276 oprnd_info->first_dt = vect_external_def;
2280 if (oprnd_info->first_dt == vect_external_def
2281 || oprnd_info->first_dt == vect_constant_def)
2283 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2284 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2285 oprnd_info->ops = vNULL;
2286 children.safe_push (invnode);
2287 continue;
2290 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2291 group_size, &this_max_nunits,
2292 matches, limit,
2293 &this_tree_size, bst_map)) != NULL)
2295 oprnd_info->def_stmts = vNULL;
2296 children.safe_push (child);
2297 continue;
2300 /* If the SLP build for operand zero failed and operand zero
2301 and one can be commutated try that for the scalar stmts
2302 that failed the match. */
2303 if (i == 0
2304 /* A first scalar stmt mismatch signals a fatal mismatch. */
2305 && matches[0]
2306 /* ??? For COND_EXPRs we can swap the comparison operands
2307 as well as the arms under some constraints. */
2308 && nops == 2
2309 && oprnds_info[1]->first_dt == vect_internal_def
2310 && is_gimple_assign (stmt_info->stmt)
2311 /* Swapping operands for reductions breaks assumptions later on. */
2312 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2313 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2315 /* See whether we can swap the matching or the non-matching
2316 stmt operands. */
2317 bool swap_not_matching = true;
2320 for (j = 0; j < group_size; ++j)
2322 if (matches[j] != !swap_not_matching)
2323 continue;
2324 stmt_vec_info stmt_info = stmts[j];
2325 /* Verify if we can swap operands of this stmt. */
2326 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2327 if (!stmt
2328 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2330 if (!swap_not_matching)
2331 goto fail;
2332 swap_not_matching = false;
2333 break;
2337 while (j != group_size);
2339 /* Swap mismatched definition stmts. */
2340 if (dump_enabled_p ())
2341 dump_printf_loc (MSG_NOTE, vect_location,
2342 "Re-trying with swapped operands of stmts ");
2343 for (j = 0; j < group_size; ++j)
2344 if (matches[j] == !swap_not_matching)
2346 std::swap (oprnds_info[0]->def_stmts[j],
2347 oprnds_info[1]->def_stmts[j]);
2348 std::swap (oprnds_info[0]->ops[j],
2349 oprnds_info[1]->ops[j]);
2350 if (dump_enabled_p ())
2351 dump_printf (MSG_NOTE, "%d ", j);
2353 if (dump_enabled_p ())
2354 dump_printf (MSG_NOTE, "\n");
2355 /* After swapping some operands we lost track whether an
2356 operand has any pattern defs so be conservative here. */
2357 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2358 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2359 /* And try again with scratch 'matches' ... */
2360 bool *tem = XALLOCAVEC (bool, group_size);
2361 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2362 group_size, &this_max_nunits,
2363 tem, limit,
2364 &this_tree_size, bst_map)) != NULL)
2366 oprnd_info->def_stmts = vNULL;
2367 children.safe_push (child);
2368 continue;
2371 fail:
2373 /* If the SLP build failed and we analyze a basic-block
2374 simply treat nodes we fail to build as externally defined
2375 (and thus build vectors from the scalar defs).
2376 The cost model will reject outright expensive cases.
2377 ??? This doesn't treat cases where permutation ultimately
2378 fails (or we don't try permutation below). Ideally we'd
2379 even compute a permutation that will end up with the maximum
2380 SLP tree size... */
2381 if (is_a <bb_vec_info> (vinfo)
2382 /* ??? Rejecting patterns this way doesn't work. We'd have to
2383 do extra work to cancel the pattern so the uses see the
2384 scalar version. */
2385 && !is_pattern_stmt_p (stmt_info)
2386 && !oprnd_info->any_pattern)
2388 /* But if there's a leading vector-sized set of matching stmts,
2389 fail here so we can split the group. This matches the condition
2390 vect_analyze_slp_instance uses. */
2391 /* ??? We might want to split here and combine the results to support
2392 multiple vector sizes better. */
2393 for (j = 0; j < group_size; ++j)
2394 if (!matches[j])
2395 break;
2396 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2398 if (dump_enabled_p ())
2399 dump_printf_loc (MSG_NOTE, vect_location,
2400 "Building vector operands from scalars\n");
2401 this_tree_size++;
2402 child = vect_create_new_slp_node (oprnd_info->ops);
2403 children.safe_push (child);
2404 oprnd_info->ops = vNULL;
2405 continue;
2409 gcc_assert (child == NULL);
2410 FOR_EACH_VEC_ELT (children, j, child)
2411 if (child)
2412 vect_free_slp_tree (child);
2413 vect_free_oprnd_info (oprnds_info);
2414 return NULL;
2417 vect_free_oprnd_info (oprnds_info);
2419 /* If all children of a node are built up from uniform scalars, or if
2420 building it requires more than one possibly expensive vector
2421 construction, just throw the node away so it is built up from scalars.
2422 The exception is the SLP node for the vector store. */
2423 if (is_a <bb_vec_info> (vinfo)
2424 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2425 /* ??? Rejecting patterns this way doesn't work. We'd have to
2426 do extra work to cancel the pattern so the uses see the
2427 scalar version. */
2428 && !is_pattern_stmt_p (stmt_info))
2430 slp_tree child;
2431 unsigned j;
2432 bool all_uniform_p = true;
2433 unsigned n_vector_builds = 0;
2434 FOR_EACH_VEC_ELT (children, j, child)
2436 if (!child)
2438 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2439 all_uniform_p = false;
2440 else if (!vect_slp_tree_uniform_p (child))
2442 all_uniform_p = false;
2443 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2444 n_vector_builds++;
2447 if (all_uniform_p
2448 || n_vector_builds > 1
2449 || (n_vector_builds == children.length ()
2450 && is_a <gphi *> (stmt_info->stmt)))
2452 /* Roll back. */
2453 matches[0] = false;
2454 FOR_EACH_VEC_ELT (children, j, child)
2455 if (child)
2456 vect_free_slp_tree (child);
2458 if (dump_enabled_p ())
2459 dump_printf_loc (MSG_NOTE, vect_location,
2460 "Building parent vector operands from "
2461 "scalars instead\n");
2462 return NULL;
2466 *tree_size += this_tree_size + 1;
2467 *max_nunits = this_max_nunits;
2469 if (two_operators)
2471 /* ??? We'd likely want to either cache in bst_map sth like
2472 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2473 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2474 explicit stmts to put in so the keying on 'stmts' doesn't
2475 work (but we have the same issue with nodes that use 'ops'). */
2476 slp_tree one = new _slp_tree;
2477 slp_tree two = new _slp_tree;
2478 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2479 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2480 SLP_TREE_VECTYPE (one) = vectype;
2481 SLP_TREE_VECTYPE (two) = vectype;
2482 SLP_TREE_CHILDREN (one).safe_splice (children);
2483 SLP_TREE_CHILDREN (two).safe_splice (children);
2484 slp_tree child;
2485 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2486 SLP_TREE_REF_COUNT (child)++;
2488 /* Here we record the original defs since this
2489 node represents the final lane configuration. */
2490 node = vect_create_new_slp_node (node, stmts, 2);
2491 SLP_TREE_VECTYPE (node) = vectype;
2492 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2493 SLP_TREE_CHILDREN (node).quick_push (one);
2494 SLP_TREE_CHILDREN (node).quick_push (two);
2495 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2496 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2497 enum tree_code ocode = ERROR_MARK;
2498 stmt_vec_info ostmt_info;
2499 unsigned j = 0;
2500 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2502 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2503 if (gimple_assign_rhs_code (ostmt) != code0)
2505 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2506 ocode = gimple_assign_rhs_code (ostmt);
2507 j = i;
2509 else
2510 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2512 SLP_TREE_CODE (one) = code0;
2513 SLP_TREE_CODE (two) = ocode;
2514 SLP_TREE_LANES (one) = stmts.length ();
2515 SLP_TREE_LANES (two) = stmts.length ();
2516 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2517 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2518 return node;
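/* Editorial note (not in the original source): for a hypothetical
   addsub-style group { a0 + b0, a1 - b1, a2 + b2, a3 - b3 } the code
   above sets code0 = PLUS_EXPR and ocode = MINUS_EXPR, gives node "one"
   code PLUS_EXPR and node "two" code MINUS_EXPR, and records the lane
   permutation { 0[0], 1[1], 0[2], 1[3] }, i.e. even lanes are taken from
   the add node and odd lanes from the subtract node.  */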
2521 node = vect_create_new_slp_node (node, stmts, nops);
2522 SLP_TREE_VECTYPE (node) = vectype;
2523 SLP_TREE_CHILDREN (node).splice (children);
2524 return node;
2527 /* Dump a single SLP tree NODE. */
2529 static void
2530 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2531 slp_tree node)
2533 unsigned i, j;
2534 slp_tree child;
2535 stmt_vec_info stmt_info;
2536 tree op;
2538 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2539 dump_user_location_t user_loc = loc.get_user_location ();
2540 dump_printf_loc (metadata, user_loc,
2541 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2542 ", refcnt=%u)",
2543 SLP_TREE_DEF_TYPE (node) == vect_external_def
2544 ? " (external)"
2545 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2546 ? " (constant)"
2547 : ""), (void *) node,
2548 estimated_poly_value (node->max_nunits),
2549 SLP_TREE_REF_COUNT (node));
2550 if (SLP_TREE_VECTYPE (node))
2551 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2552 dump_printf (metadata, "\n");
2553 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2555 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2556 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2557 else
2558 dump_printf_loc (metadata, user_loc, "op template: %G",
2559 SLP_TREE_REPRESENTATIVE (node)->stmt);
2561 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2562 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2563 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2564 else
2566 dump_printf_loc (metadata, user_loc, "\t{ ");
2567 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2568 dump_printf (metadata, "%T%s ", op,
2569 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2570 dump_printf (metadata, "}\n");
2572 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2574 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2575 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2576 dump_printf (dump_kind, " %u", j);
2577 dump_printf (dump_kind, " }\n");
2579 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2581 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2582 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2583 dump_printf (dump_kind, " %u[%u]",
2584 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2585 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2586 dump_printf (dump_kind, " }\n");
2588 if (SLP_TREE_CHILDREN (node).is_empty ())
2589 return;
2590 dump_printf_loc (metadata, user_loc, "\tchildren");
2591 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2592 dump_printf (dump_kind, " %p", (void *)child);
2593 dump_printf (dump_kind, "\n");
2596 DEBUG_FUNCTION void
2597 debug (slp_tree node)
2599 debug_dump_context ctx;
2600 vect_print_slp_tree (MSG_NOTE,
2601 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2602 node);
2605 /* Recursive helper for the dot producer below. */
2607 static void
2608 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2610 if (visited.add (node))
2611 return;
2613 fprintf (f, "\"%p\" [label=\"", (void *)node);
2614 vect_print_slp_tree (MSG_NOTE,
2615 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2616 node);
2617 fprintf (f, "\"];\n");
2620 for (slp_tree child : SLP_TREE_CHILDREN (node))
2621 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2623 for (slp_tree child : SLP_TREE_CHILDREN (node))
2624 if (child)
2625 dot_slp_tree (f, child, visited);
2628 DEBUG_FUNCTION void
2629 dot_slp_tree (const char *fname, slp_tree node)
2631 FILE *f = fopen (fname, "w");
2632 fprintf (f, "digraph {\n");
2633 fflush (f);
2635 debug_dump_context ctx (f);
2636 hash_set<slp_tree> visited;
2637 dot_slp_tree (f, node, visited);
2639 fflush (f);
2640 fprintf (f, "}\n");
2641 fclose (f);
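/* Editorial note (not in the original source): both debug () and
   dot_slp_tree () above are DEBUG_FUNCTIONs meant to be called from a
   debugger rather than from the vectorizer itself, e.g. from gdb:
     (gdb) call debug (node)
     (gdb) call dot_slp_tree ("/tmp/slp.dot", node)
   The resulting file can then be rendered with Graphviz, for example
   "dot -Tpng /tmp/slp.dot -o slp.png".  */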
2644 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2646 static void
2647 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2648 slp_tree node, hash_set<slp_tree> &visited)
2650 unsigned i;
2651 slp_tree child;
2653 if (visited.add (node))
2654 return;
2656 vect_print_slp_tree (dump_kind, loc, node);
2658 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2659 if (child)
2660 vect_print_slp_graph (dump_kind, loc, child, visited);
2663 static void
2664 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2665 slp_tree entry)
2667 hash_set<slp_tree> visited;
2668 vect_print_slp_graph (dump_kind, loc, entry, visited);
2671 /* Mark the tree rooted at NODE with PURE_SLP. */
2673 static void
2674 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2676 int i;
2677 stmt_vec_info stmt_info;
2678 slp_tree child;
2680 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2681 return;
2683 if (visited.add (node))
2684 return;
2686 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2687 STMT_SLP_TYPE (stmt_info) = pure_slp;
2689 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2690 if (child)
2691 vect_mark_slp_stmts (child, visited);
2694 static void
2695 vect_mark_slp_stmts (slp_tree node)
2697 hash_set<slp_tree> visited;
2698 vect_mark_slp_stmts (node, visited);
2701 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2703 static void
2704 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2706 int i;
2707 stmt_vec_info stmt_info;
2708 slp_tree child;
2710 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2711 return;
2713 if (visited.add (node))
2714 return;
2716 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2718 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2719 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2720 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2723 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2724 if (child)
2725 vect_mark_slp_stmts_relevant (child, visited);
2728 static void
2729 vect_mark_slp_stmts_relevant (slp_tree node)
2731 hash_set<slp_tree> visited;
2732 vect_mark_slp_stmts_relevant (node, visited);
2736 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2738 static void
2739 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2740 hash_set<slp_tree> &visited)
2742 if (!node || visited.add (node))
2743 return;
2745 if (SLP_TREE_CHILDREN (node).length () == 0)
2747 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2748 return;
2749 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2750 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2751 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2752 loads.safe_push (node);
2754 else
2756 unsigned i;
2757 slp_tree child;
2758 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2759 vect_gather_slp_loads (loads, child, visited);
2764 /* Find the last scalar stmt in NODE. */
2766 stmt_vec_info
2767 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2769 stmt_vec_info last = NULL;
2770 stmt_vec_info stmt_vinfo;
2772 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2774 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2775 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2778 return last;
2781 /* Find the first stmt in NODE. */
2783 stmt_vec_info
2784 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2786 stmt_vec_info first = NULL;
2787 stmt_vec_info stmt_vinfo;
2789 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2791 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2792 if (!first
2793 || get_later_stmt (stmt_vinfo, first) == first)
2794 first = stmt_vinfo;
2797 return first;
2800 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2801 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2802 (also containing the first GROUP1_SIZE stmts, since stores are
2803 consecutive), the second containing the remainder.
2804 Return the first stmt in the second group. */
2806 static stmt_vec_info
2807 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2809 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2810 gcc_assert (group1_size > 0);
2811 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2812 gcc_assert (group2_size > 0);
2813 DR_GROUP_SIZE (first_vinfo) = group1_size;
2815 stmt_vec_info stmt_info = first_vinfo;
2816 for (unsigned i = group1_size; i > 1; i--)
2818 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2819 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2821 /* STMT is now the last element of the first group. */
2822 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2823 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2825 DR_GROUP_SIZE (group2) = group2_size;
2826 for (stmt_info = group2; stmt_info;
2827 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2829 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2830 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2833 /* For the second group, the DR_GROUP_GAP is that before the original group,
2834 plus skipping over the first vector. */
2835 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2837 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2838 DR_GROUP_GAP (first_vinfo) += group2_size;
2840 if (dump_enabled_p ())
2841 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2842 group1_size, group2_size);
2844 return group2;
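/* Editorial note (not in the original source): for example, splitting a
   store group of 6 stmts with original DR_GROUP_GAP G into GROUP1_SIZE = 4
   plus a remainder of 2 gives the second group DR_GROUP_GAP G + 4 (it has
   to skip over the first four stmts) while the first group ends up with
   DR_GROUP_GAP G + 2 (it has to skip over the second group).  */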
2847 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2848 statements and a vector of NUNITS elements. */
2850 static poly_uint64
2851 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2853 return exact_div (common_multiple (nunits, group_size), group_size);
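/* Editorial note (not in the original source): e.g. with NUNITS = 4 and
   GROUP_SIZE = 6 the least common multiple is 12, so the unrolling factor
   is 12 / 6 = 2: two copies of the group fill exactly three 4-lane
   vectors.  When GROUP_SIZE divides NUNITS, say NUNITS = 8 and
   GROUP_SIZE = 4, the factor is 8 / 4 = 2 and two copies of the group
   fill a single vector.  */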
2856 /* Helper that checks to see if a node is a load node. */
2858 static inline bool
2859 vect_is_slp_load_node (slp_tree root)
2861 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2862 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2863 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2867 /* Helper function of optimize_load_redistribution that performs the operation
2868 recursively. */
2870 static slp_tree
2871 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2872 vec_info *vinfo, unsigned int group_size,
2873 hash_map<slp_tree, slp_tree> *load_map,
2874 slp_tree root)
2876 if (slp_tree *leader = load_map->get (root))
2877 return *leader;
2879 slp_tree node;
2880 unsigned i;
2882 /* For now, we don't know anything about externals so do not do anything. */
2883 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2884 return NULL;
2885 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2887 /* First convert this node into a load node and add it to the leaves
2888 list and flatten the permute from a lane to a load one. If it's
2889 unneeded it will be elided later. */
2890 vec<stmt_vec_info> stmts;
2891 stmts.create (SLP_TREE_LANES (root));
2892 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2893 for (unsigned j = 0; j < lane_perm.length (); j++)
2895 std::pair<unsigned, unsigned> perm = lane_perm[j];
2896 node = SLP_TREE_CHILDREN (root)[perm.first];
2898 if (!vect_is_slp_load_node (node)
2899 || SLP_TREE_CHILDREN (node).exists ())
2901 stmts.release ();
2902 goto next;
2905 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2908 if (dump_enabled_p ())
2909 dump_printf_loc (MSG_NOTE, vect_location,
2910 "converting stmts on permute node %p\n",
2911 (void *) root);
2913 bool *matches = XALLOCAVEC (bool, group_size);
2914 poly_uint64 max_nunits = 1;
2915 unsigned tree_size = 0, limit = 1;
2916 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2917 matches, &limit, &tree_size, bst_map);
2918 if (!node)
2919 stmts.release ();
2921 load_map->put (root, node);
2922 return node;
2925 next:
2926 load_map->put (root, NULL);
2928 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2930 slp_tree value
2931 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2932 node);
2933 if (value)
2935 SLP_TREE_REF_COUNT (value)++;
2936 SLP_TREE_CHILDREN (root)[i] = value;
2937 /* ??? We know the original leaves of the replaced nodes will
2938 be referenced by bst_map, only the permutes created by
2939 pattern matching are not. */
2940 if (SLP_TREE_REF_COUNT (node) == 1)
2941 load_map->remove (node);
2942 vect_free_slp_tree (node);
2946 return NULL;
2949 /* Temporary workaround for loads not being CSEd during SLP build. This
2950 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
2951 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2952 same DR such that the final operation is equal to a permuted load. Such
2953 NODES are then directly converted into LOADS themselves. The nodes are
2954 CSEd using BST_MAP. */
2956 static void
2957 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2958 vec_info *vinfo, unsigned int group_size,
2959 hash_map<slp_tree, slp_tree> *load_map,
2960 slp_tree root)
2962 slp_tree node;
2963 unsigned i;
2965 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2967 slp_tree value
2968 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2969 node);
2970 if (value)
2972 SLP_TREE_REF_COUNT (value)++;
2973 SLP_TREE_CHILDREN (root)[i] = value;
2974 /* ??? We know the original leaves of the replaced nodes will
2975 be referenced by bst_map, only the permutes created by
2976 pattern matching are not. */
2977 if (SLP_TREE_REF_COUNT (node) == 1)
2978 load_map->remove (node);
2979 vect_free_slp_tree (node);
2984 /* Helper function of vect_match_slp_patterns.
2986 Attempts to match patterns against the slp tree rooted in REF_NODE using
2987 VINFO. Patterns are matched in post-order traversal.
2989 If matching is successful the value in REF_NODE is updated and returned;
2990 if not, it is returned unchanged. */
2992 static bool
2993 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2994 slp_tree_to_load_perm_map_t *perm_cache,
2995 slp_compat_nodes_map_t *compat_cache,
2996 hash_set<slp_tree> *visited)
2998 unsigned i;
2999 slp_tree node = *ref_node;
3000 bool found_p = false;
3001 if (!node || visited->add (node))
3002 return false;
3004 slp_tree child;
3005 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3006 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3007 vinfo, perm_cache, compat_cache,
3008 visited);
3010 for (unsigned x = 0; x < num__slp_patterns; x++)
3012 vect_pattern *pattern
3013 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3014 if (pattern)
3016 pattern->build (vinfo);
3017 delete pattern;
3018 found_p = true;
3022 return found_p;
3025 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3026 vec_info VINFO.
3028 The modified tree is returned. Patterns are tried in order and multiple
3029 patterns may match. */
3031 static bool
3032 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3033 hash_set<slp_tree> *visited,
3034 slp_tree_to_load_perm_map_t *perm_cache,
3035 slp_compat_nodes_map_t *compat_cache)
3037 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3038 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3040 if (dump_enabled_p ())
3041 dump_printf_loc (MSG_NOTE, vect_location,
3042 "Analyzing SLP tree %p for patterns\n",
3043 (void *) SLP_INSTANCE_TREE (instance));
3045 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3046 visited);
3049 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3050 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3051 Return true if we could use IFN_STORE_LANES instead and if that appears
3052 to be the better approach. */
3054 static bool
3055 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3056 unsigned int group_size,
3057 unsigned int new_group_size)
3059 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3060 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3061 if (!vectype)
3062 return false;
3063 /* Allow the split if one of the two new groups would operate on full
3064 vectors *within* rather than across one scalar loop iteration.
3065 This is purely a heuristic, but it should work well for group
3066 sizes of 3 and 4, where the possible splits are:
3068 3->2+1: OK if the vector has exactly two elements
3069 4->2+2: Likewise
3070 4->3+1: Less clear-cut. */
3071 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3072 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3073 return false;
3074 return vect_store_lanes_supported (vectype, group_size, false);
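/* Editorial note (not in the original source): with 2-element vectors a
   4 -> 2+2 split returns false above (NEW_GROUP_SIZE is a whole number of
   vectors, so the plain split is fine), whereas a 4 -> 3+1 split falls
   through and prefers IFN_STORE_LANES whenever the target supports a
   store-lanes instruction (e.g. AArch64 ST4) for this vectype.  */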
3077 /* Analyze an SLP instance starting from a group of grouped stores. Call
3078 vect_build_slp_tree to build a tree of packed stmts if possible.
3079 Return FALSE if it's impossible to SLP any stmt in the loop. */
3081 static bool
3082 vect_analyze_slp_instance (vec_info *vinfo,
3083 scalar_stmts_to_slp_tree_map_t *bst_map,
3084 stmt_vec_info stmt_info, slp_instance_kind kind,
3085 unsigned max_tree_size, unsigned *limit);
3087 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3088 of KIND. Return true if successful. */
3090 static bool
3091 vect_build_slp_instance (vec_info *vinfo,
3092 slp_instance_kind kind,
3093 vec<stmt_vec_info> &scalar_stmts,
3094 vec<stmt_vec_info> &root_stmt_infos,
3095 unsigned max_tree_size, unsigned *limit,
3096 scalar_stmts_to_slp_tree_map_t *bst_map,
3097 /* ??? We need stmt_info for group splitting. */
3098 stmt_vec_info stmt_info_)
3100 if (dump_enabled_p ())
3102 dump_printf_loc (MSG_NOTE, vect_location,
3103 "Starting SLP discovery for\n");
3104 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3105 dump_printf_loc (MSG_NOTE, vect_location,
3106 " %G", scalar_stmts[i]->stmt);
3109 /* Build the tree for the SLP instance. */
3110 unsigned int group_size = scalar_stmts.length ();
3111 bool *matches = XALLOCAVEC (bool, group_size);
3112 poly_uint64 max_nunits = 1;
3113 unsigned tree_size = 0;
3114 unsigned i;
3115 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3116 &max_nunits, matches, limit,
3117 &tree_size, bst_map);
3118 if (node != NULL)
3120 /* Calculate the unrolling factor based on the smallest type. */
3121 poly_uint64 unrolling_factor
3122 = calculate_unrolling_factor (max_nunits, group_size);
3124 if (maybe_ne (unrolling_factor, 1U)
3125 && is_a <bb_vec_info> (vinfo))
3127 unsigned HOST_WIDE_INT const_max_nunits;
3128 if (!max_nunits.is_constant (&const_max_nunits)
3129 || const_max_nunits > group_size)
3131 if (dump_enabled_p ())
3132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3133 "Build SLP failed: store group "
3134 "size not a multiple of the vector size "
3135 "in basic block SLP\n");
3136 vect_free_slp_tree (node);
3137 return false;
3139 /* Fatal mismatch. */
3140 if (dump_enabled_p ())
3141 dump_printf_loc (MSG_NOTE, vect_location,
3142 "SLP discovery succeeded but node needs "
3143 "splitting\n");
3144 memset (matches, true, group_size);
3145 matches[group_size / const_max_nunits * const_max_nunits] = false;
3146 vect_free_slp_tree (node);
3148 else
3150 /* Create a new SLP instance. */
3151 slp_instance new_instance = XNEW (class _slp_instance);
3152 SLP_INSTANCE_TREE (new_instance) = node;
3153 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3154 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3155 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3156 SLP_INSTANCE_KIND (new_instance) = kind;
3157 new_instance->reduc_phis = NULL;
3158 new_instance->cost_vec = vNULL;
3159 new_instance->subgraph_entries = vNULL;
3161 if (dump_enabled_p ())
3162 dump_printf_loc (MSG_NOTE, vect_location,
3163 "SLP size %u vs. limit %u.\n",
3164 tree_size, max_tree_size);
3166 /* Fixup SLP reduction chains. */
3167 if (kind == slp_inst_kind_reduc_chain)
3169 /* If this is a reduction chain with a conversion in front
3170 amend the SLP tree with a node for that. */
3171 gimple *scalar_def
3172 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3173 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3175 /* Get at the conversion stmt - we know it's the single use
3176 of the last stmt of the reduction chain. */
3177 use_operand_p use_p;
3178 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3179 &use_p, &scalar_def);
3180 gcc_assert (r);
3181 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3182 next_info = vect_stmt_to_vectorize (next_info);
3183 scalar_stmts = vNULL;
3184 scalar_stmts.create (group_size);
3185 for (unsigned i = 0; i < group_size; ++i)
3186 scalar_stmts.quick_push (next_info);
3187 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3188 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3189 SLP_TREE_CHILDREN (conv).quick_push (node);
3190 SLP_INSTANCE_TREE (new_instance) = conv;
3191 /* We also have to fake this conversion stmt as SLP reduction
3192 group so we don't have to mess with too much code
3193 elsewhere. */
3194 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3195 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3197 /* Fill the backedge child of the PHI SLP node. The
3198 general matching code cannot find it because the
3199 scalar code does not reflect how we vectorize the
3200 reduction. */
3201 use_operand_p use_p;
3202 imm_use_iterator imm_iter;
3203 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3204 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3205 gimple_get_lhs (scalar_def))
3206 /* There are exactly two non-debug uses, the reduction
3207 PHI and the loop-closed PHI node. */
3208 if (!is_gimple_debug (USE_STMT (use_p))
3209 && gimple_bb (USE_STMT (use_p)) == loop->header)
3211 auto_vec<stmt_vec_info, 64> phis (group_size);
3212 stmt_vec_info phi_info
3213 = vinfo->lookup_stmt (USE_STMT (use_p));
3214 for (unsigned i = 0; i < group_size; ++i)
3215 phis.quick_push (phi_info);
3216 slp_tree *phi_node = bst_map->get (phis);
3217 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3218 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3219 = SLP_INSTANCE_TREE (new_instance);
3220 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3224 vinfo->slp_instances.safe_push (new_instance);
3226 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3227 the number of scalar stmts in the root in a few places.
3228 Verify that assumption holds. */
3229 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3230 .length () == group_size);
3232 if (dump_enabled_p ())
3234 dump_printf_loc (MSG_NOTE, vect_location,
3235 "Final SLP tree for instance %p:\n",
3236 (void *) new_instance);
3237 vect_print_slp_graph (MSG_NOTE, vect_location,
3238 SLP_INSTANCE_TREE (new_instance));
3241 return true;
3244 else
3246 /* Failed to SLP. */
3247 /* Free the allocated memory. */
3248 scalar_stmts.release ();
3251 stmt_vec_info stmt_info = stmt_info_;
3252 /* Try to break the group up into pieces. */
3253 if (kind == slp_inst_kind_store)
3255 /* ??? We could delay all the actual splitting of store-groups
3256 until after SLP discovery of the original group completed.
3257 Then we can recurse to vect_build_slp_instance directly. */
3258 for (i = 0; i < group_size; i++)
3259 if (!matches[i])
3260 break;
3262 /* For basic block SLP, try to break the group up into multiples of
3263 a vector size. */
3264 if (is_a <bb_vec_info> (vinfo)
3265 && (i > 1 && i < group_size))
3267 tree scalar_type
3268 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3269 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3270 1 << floor_log2 (i));
3271 unsigned HOST_WIDE_INT const_nunits;
3272 if (vectype
3273 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3275 /* Split into two groups at the first vector boundary. */
3276 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3277 unsigned group1_size = i & ~(const_nunits - 1);
3279 if (dump_enabled_p ())
3280 dump_printf_loc (MSG_NOTE, vect_location,
3281 "Splitting SLP group at stmt %u\n", i);
3282 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3283 group1_size);
3284 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3285 kind, max_tree_size,
3286 limit);
3287 /* Split the rest at the failure point and possibly
3288 re-analyze the remaining matching part if it has
3289 at least two lanes. */
3290 if (group1_size < i
3291 && (i + 1 < group_size
3292 || i - group1_size > 1))
3294 stmt_vec_info rest2 = rest;
3295 rest = vect_split_slp_store_group (rest, i - group1_size);
3296 if (i - group1_size > 1)
3297 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3298 kind, max_tree_size,
3299 limit);
3301 /* Re-analyze the non-matching tail if it has at least
3302 two lanes. */
3303 if (i + 1 < group_size)
3304 res |= vect_analyze_slp_instance (vinfo, bst_map,
3305 rest, kind, max_tree_size,
3306 limit);
3307 return res;
3311 /* For loop vectorization split into arbitrary pieces of size > 1. */
3312 if (is_a <loop_vec_info> (vinfo)
3313 && (i > 1 && i < group_size)
3314 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3316 unsigned group1_size = i;
3318 if (dump_enabled_p ())
3319 dump_printf_loc (MSG_NOTE, vect_location,
3320 "Splitting SLP group at stmt %u\n", i);
3322 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3323 group1_size);
3324 /* Loop vectorization cannot handle gaps in stores, make sure
3325 the split group appears as strided. */
3326 STMT_VINFO_STRIDED_P (rest) = 1;
3327 DR_GROUP_GAP (rest) = 0;
3328 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3329 DR_GROUP_GAP (stmt_info) = 0;
3331 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3332 kind, max_tree_size, limit);
3333 if (i + 1 < group_size)
3334 res |= vect_analyze_slp_instance (vinfo, bst_map,
3335 rest, kind, max_tree_size, limit);
3337 return res;
3340 /* Even though the first vector did not all match, we might be able to SLP
3341 (some) of the remainder. FORNOW ignore this possibility. */
3344 /* Failed to SLP. */
3345 if (dump_enabled_p ())
3346 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3347 return false;
3351 /* Analyze an SLP instance starting from a group of grouped stores. Call
3352 vect_build_slp_tree to build a tree of packed stmts if possible.
3353 Return FALSE if it's impossible to SLP any stmt in the loop. */
3355 static bool
3356 vect_analyze_slp_instance (vec_info *vinfo,
3357 scalar_stmts_to_slp_tree_map_t *bst_map,
3358 stmt_vec_info stmt_info,
3359 slp_instance_kind kind,
3360 unsigned max_tree_size, unsigned *limit)
3362 unsigned int i;
3363 vec<stmt_vec_info> scalar_stmts;
3365 if (is_a <bb_vec_info> (vinfo))
3366 vect_location = stmt_info->stmt;
3368 stmt_vec_info next_info = stmt_info;
3369 if (kind == slp_inst_kind_store)
3371 /* Collect the stores and store them in scalar_stmts. */
3372 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3373 while (next_info)
3375 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3376 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3379 else if (kind == slp_inst_kind_reduc_chain)
3381 /* Collect the reduction stmts and store them in scalar_stmts. */
3382 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3383 while (next_info)
3385 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3386 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3388 /* Mark the first element of the reduction chain as reduction to properly
3389 transform the node. In the reduction analysis phase only the last
3390 element of the chain is marked as reduction. */
3391 STMT_VINFO_DEF_TYPE (stmt_info)
3392 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3393 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3394 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3396 else if (kind == slp_inst_kind_ctor)
3398 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
3399 tree val;
3400 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
3401 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
3403 stmt_vec_info def_info = vinfo->lookup_def (val);
3404 def_info = vect_stmt_to_vectorize (def_info);
3405 scalar_stmts.quick_push (def_info);
3407 if (dump_enabled_p ())
3408 dump_printf_loc (MSG_NOTE, vect_location,
3409 "Analyzing vectorizable constructor: %G\n",
3410 stmt_info->stmt);
3412 else if (kind == slp_inst_kind_reduc_group)
3414 /* Collect reduction statements. */
3415 const vec<stmt_vec_info> &reductions
3416 = as_a <loop_vec_info> (vinfo)->reductions;
3417 scalar_stmts.create (reductions.length ());
3418 for (i = 0; reductions.iterate (i, &next_info); i++)
3419 if ((STMT_VINFO_RELEVANT_P (next_info)
3420 || STMT_VINFO_LIVE_P (next_info))
3421 /* ??? Make sure we didn't skip a conversion around a reduction
3422 path. In that case we'd have to reverse engineer that conversion
3423 stmt following the chain using reduc_idx and from the PHI
3424 using reduc_def. */
3425 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3426 scalar_stmts.quick_push (next_info);
3427 /* If less than two were relevant/live there's nothing to SLP. */
3428 if (scalar_stmts.length () < 2)
3429 return false;
3431 else
3432 gcc_unreachable ();
3434 vec<stmt_vec_info> roots = vNULL;
3435 if (kind == slp_inst_kind_ctor)
3437 roots.create (1);
3438 roots.quick_push (stmt_info);
3440 /* Build the tree for the SLP instance. */
3441 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3442 roots,
3443 max_tree_size, limit, bst_map,
3444 kind == slp_inst_kind_store
3445 ? stmt_info : NULL);
3446 if (!res)
3447 roots.release ();
3449 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3450 where we should do store group splitting. */
3452 return res;
3455 /* Check if there are stmts in the loop that can be vectorized using SLP.
3456 Build SLP trees of packed scalar stmts if SLP is possible. */
3458 opt_result
3459 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3461 unsigned int i;
3462 stmt_vec_info first_element;
3463 slp_instance instance;
3465 DUMP_VECT_SCOPE ("vect_analyze_slp");
3467 unsigned limit = max_tree_size;
3469 scalar_stmts_to_slp_tree_map_t *bst_map
3470 = new scalar_stmts_to_slp_tree_map_t ();
3472 /* Find SLP sequences starting from groups of grouped stores. */
3473 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3474 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3475 STMT_VINFO_GROUPED_ACCESS (first_element)
3476 ? slp_inst_kind_store : slp_inst_kind_ctor,
3477 max_tree_size, &limit);
3479 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3481 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3483 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3484 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3485 bb_vinfo->roots[i].stmts,
3486 bb_vinfo->roots[i].roots,
3487 max_tree_size, &limit, bst_map, NULL))
3489 bb_vinfo->roots[i].stmts = vNULL;
3490 bb_vinfo->roots[i].roots = vNULL;
3495 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3497 /* Find SLP sequences starting from reduction chains. */
3498 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3499 if (! STMT_VINFO_RELEVANT_P (first_element)
3500 && ! STMT_VINFO_LIVE_P (first_element))
3502 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3503 slp_inst_kind_reduc_chain,
3504 max_tree_size, &limit))
3506 /* Dissolve reduction chain group. */
3507 stmt_vec_info vinfo = first_element;
3508 stmt_vec_info last = NULL;
3509 while (vinfo)
3511 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3512 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3513 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3514 last = vinfo;
3515 vinfo = next;
3517 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3518 /* It can be still vectorized as part of an SLP reduction. */
3519 loop_vinfo->reductions.safe_push (last);
3522 /* Find SLP sequences starting from groups of reductions. */
3523 if (loop_vinfo->reductions.length () > 1)
3524 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3525 slp_inst_kind_reduc_group, max_tree_size,
3526 &limit);
3529 hash_set<slp_tree> visited_patterns;
3530 slp_tree_to_load_perm_map_t perm_cache;
3531 slp_compat_nodes_map_t compat_cache;
3533 /* See if any patterns can be found in the SLP tree. */
3534 bool pattern_found = false;
3535 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3536 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3537 &visited_patterns, &perm_cache,
3538 &compat_cache);
3540 /* If any were found optimize permutations of loads. */
3541 if (pattern_found)
3543 hash_map<slp_tree, slp_tree> load_map;
3544 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3546 slp_tree root = SLP_INSTANCE_TREE (instance);
3547 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3548 &load_map, root);
3554 /* The map keeps a reference on SLP nodes built, release that. */
3555 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3556 it != bst_map->end (); ++it)
3557 if ((*it).second)
3558 vect_free_slp_tree ((*it).second);
3559 delete bst_map;
3561 if (pattern_found && dump_enabled_p ())
3563 dump_printf_loc (MSG_NOTE, vect_location,
3564 "Pattern matched SLP tree\n");
3565 hash_set<slp_tree> visited;
3566 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3567 vect_print_slp_graph (MSG_NOTE, vect_location,
3568 SLP_INSTANCE_TREE (instance), visited);
3571 return opt_result::success ();
3574 /* Estimates the cost of inserting layout changes into the SLP graph.
3575 It can also say that the insertion is impossible. */
3577 struct slpg_layout_cost
3579 slpg_layout_cost () = default;
3580 slpg_layout_cost (sreal, bool);
3582 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3583 bool is_possible () const { return depth != sreal::max (); }
3585 bool operator== (const slpg_layout_cost &) const;
3586 bool operator!= (const slpg_layout_cost &) const;
3588 bool is_better_than (const slpg_layout_cost &, bool) const;
3590 void add_parallel_cost (const slpg_layout_cost &);
3591 void add_serial_cost (const slpg_layout_cost &);
3592 void split (unsigned int);
3594 /* The longest sequence of layout changes needed during any traversal
3595 of the partition dag, weighted by execution frequency.
3597 This is the most important metric when optimizing for speed, since
3598 it helps to ensure that we keep the number of operations on
3599 critical paths to a minimum. */
3600 sreal depth = 0;
3602 /* An estimate of the total number of operations needed. It is weighted by
3603 execution frequency when optimizing for speed but not when optimizing for
3604 size. In order to avoid double-counting, a node with a fanout of N will
3605 distribute 1/N of its total cost to each successor.
3607 This is the most important metric when optimizing for size, since
3608 it helps to keep the total number of operations to a minimum. */
3609 sreal total = 0;
3612 /* Construct costs for a node with weight WEIGHT. A higher weight
3613 indicates more frequent execution. IS_FOR_SIZE is true if we are
3614 optimizing for size rather than speed. */
3616 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3617 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3621 bool
3622 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3624 return depth == other.depth && total == other.total;
3627 bool
3628 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3630 return !operator== (other);
3633 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3634 true if we are optimizing for size rather than speed. */
3636 bool
3637 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3638 bool is_for_size) const
3640 if (is_for_size)
3642 if (total != other.total)
3643 return total < other.total;
3644 return depth < other.depth;
3646 else
3648 if (depth != other.depth)
3649 return depth < other.depth;
3650 return total < other.total;
3654 /* Increase the costs to account for something with cost INPUT_COST
3655 happening in parallel with the current costs. */
3657 void
3658 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3660 depth = std::max (depth, input_cost.depth);
3661 total += input_cost.total;
3664 /* Increase the costs to account for something with cost INPUT_COST
3665 happening in series with the current costs. */
3667 void
3668 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3670 depth += other.depth;
3671 total += other.total;
3674 /* Split the total cost among TIMES successors or predecessors. */
3676 void
3677 slpg_layout_cost::split (unsigned int times)
3679 if (times > 1)
3680 total /= times;
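/* A small worked example of how these costs combine (values are arbitrary,
   for illustration only): with A = { depth 2, total 3 } and
   B = { depth 1, total 1 },

     A.add_serial_cost (B)   gives { depth 3, total 4 }  (2 + 1, 3 + 1)
     A.add_parallel_cost (B) gives { depth 2, total 4 }  (max (2, 1), 3 + 1)

   and calling split (2) on { depth 3, total 4 } halves only the total,
   giving { depth 3, total 2 }.  */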
3683 /* Information about one node in the SLP graph, for use during
3684 vect_optimize_slp_pass. */
3686 struct slpg_vertex
3688 slpg_vertex (slp_tree node_) : node (node_) {}
3690 /* The node itself. */
3691 slp_tree node;
3693 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3694 partitions are flexible; they can have whichever layout consumers
3695 want them to have. */
3696 int partition = -1;
3698 /* The number of nodes that directly use the result of this one
3699 (i.e. the number of nodes that count this one as a child). */
3700 unsigned int out_degree = 0;
3702 /* The execution frequency of the node. */
3703 sreal weight = 0;
3705 /* The total execution frequency of all nodes that directly use the
3706 result of this one. */
3707 sreal out_weight = 0;
3710 /* Information about one partition of the SLP graph, for use during
3711 vect_optimize_slp_pass. */
3713 struct slpg_partition_info
3715 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3716 of m_partitioned_nodes. */
3717 unsigned int node_begin = 0;
3718 unsigned int node_end = 0;
3720 /* Which layout we've chosen to use for this partition, or -1 if
3721 we haven't picked one yet. */
3722 int layout = -1;
3724 /* The number of predecessors and successors in the partition dag.
3725 The predecessors always have lower partition numbers and the
3726 successors always have higher partition numbers.
3728 Note that the directions of these edges are not necessarily the
3729 same as in the data flow graph. For example, if an SCC has separate
3730 partitions for an inner loop and an outer loop, the inner loop's
3731 partition will have at least two incoming edges from the outer loop's
3732 partition: one for a live-in value and one for a live-out value.
3733 In data flow terms, one of these edges would also be from the outer loop
3734 to the inner loop, but the other would be in the opposite direction. */
3735 unsigned int in_degree = 0;
3736 unsigned int out_degree = 0;
3739 /* Information about the costs of using a particular layout for a
3740 particular partition. It can also say that the combination is
3741 impossible. */
3743 struct slpg_partition_layout_costs
3745 bool is_possible () const { return internal_cost.is_possible (); }
3746 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3748 /* The costs inherited from predecessor partitions. */
3749 slpg_layout_cost in_cost;
3751 /* The inherent cost of the layout within the node itself. For example,
3752 this is nonzero for a load if choosing a particular layout would require
3753 the load to permute the loaded elements. It is nonzero for a
3754 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3755 to full-vector moves. */
3756 slpg_layout_cost internal_cost;
3758 /* The costs inherited from successor partitions. */
3759 slpg_layout_cost out_cost;
3762 /* This class tries to optimize the layout of vectors in order to avoid
3763 unnecessary shuffling. At the moment, the set of possible layouts is
3764 restricted to bijective permutations.
3766 The goal of the pass depends on whether we're optimizing for size or
3767 for speed. When optimizing for size, the goal is to reduce the overall
3768 number of layout changes (including layout changes implied by things
3769 like load permutations). When optimizing for speed, the goal is to
3770 reduce the maximum latency attributable to layout changes on any
3771 non-cyclical path through the data flow graph.
3773 For example, when optimizing a loop nest for speed, we will prefer
3774 to make layout changes outside of a loop rather than inside of a loop,
3775 and will prefer to make layout changes in parallel rather than serially,
3776 even if that increases the overall number of layout changes.
3778 The high-level procedure is:
3780 (1) Build a graph in which edges go from uses (parents) to definitions
3781 (children).
3783 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3785 (3) When optimizing for speed, partition the nodes in each SCC based
3786 on their containing cfg loop. When optimizing for size, treat
3787 each SCC as a single partition.
3789 This gives us a dag of partitions. The goal is now to assign a
3790 layout to each partition.
3792 (4) Construct a set of vector layouts that are worth considering.
3793 Record which nodes must keep their current layout.
3795 (5) Perform a forward walk over the partition dag (from loads to stores)
3796 accumulating the "forward" cost of using each layout. When visiting
3797 each partition, assign a tentative choice of layout to the partition
3798 and use that choice when calculating the cost of using a different
3799 layout in successor partitions.
3801 (6) Perform a backward walk over the partition dag (from stores to loads),
3802 accumulating the "backward" cost of using each layout. When visiting
3803 each partition, make a final choice of layout for that partition based
3804 on the accumulated forward costs (from (5)) and backward costs
3805 (from (6)).
3807 (7) Apply the chosen layouts to the SLP graph.
3809 For example, consider the SLP statements:
3811 S1: a_1 = load
3812 loop:
3813 S2: a_2 = PHI<a_1, a_3>
3814 S3: b_1 = load
3815 S4: a_3 = a_2 + b_1
3816 exit:
3817 S5: a_4 = PHI<a_3>
3818 S6: store a_4
3820 S2 and S4 form an SCC and are part of the same loop. Every other
3821 statement is in a singleton SCC. In this example there is a one-to-one
3822 mapping between SCCs and partitions and the partition dag looks like this:
3824       S1     S3
3825        \     /
3826         S2+S4
3827           |
3828          S5
3829           |
3830          S6
3832 S2, S3 and S4 will have a higher execution frequency than the other
3833 statements, so when optimizing for speed, the goal is to avoid any
3834 layout changes:
3836 - within S3
3837 - within S2+S4
3838 - on the S3->S2+S4 edge
3840 For example, if S3 was originally a reversing load, the goal of the
3841 pass is to make it an unreversed load and change the layout on the
3842 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
3843 on S1->S2+S4 and S5->S6 would also be acceptable.)
3845 The difference between SCCs and partitions becomes important if we
3846 add an outer loop:
3848 S1: a_1 = ...
3849 loop1:
3850 S2: a_2 = PHI<a_1, a_6>
3851 S3: b_1 = load
3852 S4: a_3 = a_2 + b_1
3853 loop2:
3854 S5: a_4 = PHI<a_3, a_5>
3855 S6: c_1 = load
3856 S7: a_5 = a_4 + c_1
3857 exit2:
3858 S8: a_6 = PHI<a_5>
3859 S9: store a_6
3860 exit1:
3862 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
3863 for speed, we usually do not want restrictions in the outer loop to "infect"
3864 the decision for the inner loop. For example, if an outer-loop node
3865 in the SCC contains a statement with a fixed layout, that should not
3866 prevent the inner loop from using a different layout. Conversely,
3867 the inner loop should not dictate a layout to the outer loop: if the
3868 outer loop does a lot of computation, then it may not be efficient to
3869 do all of that computation in the inner loop's preferred layout.
3871 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
3872 and S5+S7 (inner). We also try to arrange partitions so that:
3874 - the partition for an outer loop comes before the partition for
3875 an inner loop
3877 - if a sibling loop A dominates a sibling loop B, A's partition
3878 comes before B's
3880 This gives the following partition dag for the example above:
3882       S1        S3
3883        \        /
3884         S2+S4+S8   S6
3885          |   \\    /
3886          |    S5+S7
3887          |
3888         S9
3890 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
3891 one for a reversal of the edge S7->S8.
3893 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
3894 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
3895 preferred layout against the cost of changing the layout on entry to the
3896 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
3898 Although this works well when optimizing for speed, it has the downside
3899 when optimizing for size that the choice of layout for S5+S7 is completely
3900 independent of S9, which lessens the chance of reducing the overall number
3901 of permutations. We therefore do not partition SCCs when optimizing
3902 for size.
3904 To give a concrete example of the difference between optimizing
3905 for size and speed, consider:
3907 a[0] = (b[1] << c[3]) - d[1];
3908 a[1] = (b[0] << c[2]) - d[0];
3909 a[2] = (b[3] << c[1]) - d[3];
3910 a[3] = (b[2] << c[0]) - d[2];
3912 There are three different layouts here: one for a, one for b and d,
3913 and one for c. When optimizing for speed it is better to permute each
3914 of b, c and d into the order required by a, since those permutations
3915 happen in parallel. But when optimizing for size, it is better to:
3917 - permute c into the same order as b
3918 - do the arithmetic
3919 - permute the result into the order required by a
3921 This gives 2 permutations rather than 3. */
3923 class vect_optimize_slp_pass
3925 public:
3926 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
3927 void run ();
3929 private:
3930 /* Graph building. */
3931 struct loop *containing_loop (slp_tree);
3932 bool is_cfg_latch_edge (graph_edge *);
3933 void build_vertices (hash_set<slp_tree> &, slp_tree);
3934 void build_vertices ();
3935 void build_graph ();
3937 /* Partitioning. */
3938 void create_partitions ();
3939 template<typename T> void for_each_partition_edge (unsigned int, T);
3941 /* Layout selection. */
3942 bool is_compatible_layout (slp_tree, unsigned int);
3943 int change_layout_cost (slp_tree, unsigned int, unsigned int);
3944 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
3945 unsigned int);
3946 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
3947 int, unsigned int);
3948 int internal_node_cost (slp_tree, int, unsigned int);
3949 void start_choosing_layouts ();
3951 /* Cost propagation. */
3952 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
3953 unsigned int, unsigned int);
3954 slpg_layout_cost total_in_cost (unsigned int);
3955 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
3956 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
3957 void forward_pass ();
3958 void backward_pass ();
3960 /* Rematerialization. */
3961 slp_tree get_result_with_layout (slp_tree, unsigned int);
3962 void materialize ();
3964 /* Clean-up. */
3965 void remove_redundant_permutations ();
3967 void dump ();
3969 vec_info *m_vinfo;
3971 /* True if we should optimize the graph for size, false if we should
3972 optimize it for speed. (It wouldn't be easy to make this decision
3973 more locally.) */
3974 bool m_optimize_size;
3976 /* A graph of all SLP nodes, with edges leading from uses to definitions.
3977 In other words, a node's predecessors are its slp_tree parents and
3978 a node's successors are its slp_tree children. */
3979 graph *m_slpg = nullptr;
3981 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
3982 auto_vec<slpg_vertex> m_vertices;
3984 /* The list of all leaves of M_SLPG, such as external definitions, constants,
3985 and loads. */
3986 auto_vec<int> m_leafs;
3988 /* This array has one entry for every vector layout that we're considering.
3989 Element 0 is null and indicates "no change". Other entries describe
3990 permutations that are inherent in the current graph and that we would
3991 like to reverse if possible.
3993 For example, a permutation { 1, 2, 3, 0 } means that something has
3994 effectively been permuted in that way, such as a load group
3995 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
3996 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
3997 in order to put things "back" in order. */
3998 auto_vec<vec<unsigned> > m_perms;
4000 /* A partitioning of the nodes for which a layout must be chosen.
4001 Each partition represents an <SCC, cfg loop> pair; that is,
4002 nodes in different SCCs belong to different partitions, and nodes
4003 within an SCC can be further partitioned according to a containing
4004 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4006 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4007 from leaves (such as loads) to roots (such as stores).
4009 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4010 auto_vec<slpg_partition_info> m_partitions;
4012 /* The list of all nodes for which a layout must be chosen. Nodes for
4013 partition P come before the nodes for partition P+1. Nodes within a
4014 partition are in reverse postorder. */
4015 auto_vec<unsigned int> m_partitioned_nodes;
4017 /* Index P * num-layouts + L contains the cost of using layout L
4018 for partition P. */
4019 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4021 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4022 original output of node N adjusted to have layout L. */
4023 auto_vec<slp_tree> m_node_layouts;
4026 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4027 Also record whether we should optimize anything for speed rather
4028 than size. */
4030 void
4031 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4032 slp_tree node)
4034 unsigned i;
4035 slp_tree child;
4037 if (visited.add (node))
4038 return;
4040 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4042 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4043 if (optimize_bb_for_speed_p (bb))
4044 m_optimize_size = false;
4047 node->vertex = m_vertices.length ();
4048 m_vertices.safe_push (slpg_vertex (node));
4050 bool leaf = true;
4051 bool force_leaf = false;
4052 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4053 if (child)
4055 leaf = false;
4056 build_vertices (visited, child);
4058 else
4059 force_leaf = true;
4060 /* Since SLP discovery works along use-def edges all cycles have an
4061 entry - but there's the exception of cycles where we do not handle
4062 the entry explicitly (but with a NULL SLP node), like some reductions
4063 and inductions. Force those SLP PHIs to act as leafs to make them
4064 backwards reachable. */
4065 if (leaf || force_leaf)
4066 m_leafs.safe_push (node->vertex);
4069 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4071 void
4072 vect_optimize_slp_pass::build_vertices ()
4074 hash_set<slp_tree> visited;
4075 unsigned i;
4076 slp_instance instance;
4077 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4078 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4081 /* Apply (reverse) bijective PERM to VEC. */
4083 template <class T>
4084 static void
4085 vect_slp_permute (vec<unsigned> perm,
4086 vec<T> &vec, bool reverse)
4088 auto_vec<T, 64> saved;
4089 saved.create (vec.length ());
4090 for (unsigned i = 0; i < vec.length (); ++i)
4091 saved.quick_push (vec[i]);
4093 if (reverse)
4095 for (unsigned i = 0; i < vec.length (); ++i)
4096 vec[perm[i]] = saved[i];
4097 for (unsigned i = 0; i < vec.length (); ++i)
4098 gcc_assert (vec[perm[i]] == saved[i]);
4100 else
4102 for (unsigned i = 0; i < vec.length (); ++i)
4103 vec[i] = saved[perm[i]];
4104 for (unsigned i = 0; i < vec.length (); ++i)
4105 gcc_assert (vec[i] == saved[perm[i]]);
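/* For illustration (not part of the implementation): with
   perm = { 1, 2, 3, 0 } and vec = { a, b, c, d },

     vect_slp_permute (perm, vec, false) turns vec into { b, c, d, a }
       (element I of the result is the old element PERM[I]), whereas
     vect_slp_permute (perm, vec, true)  turns vec into { d, a, b, c }
       (old element I moves to position PERM[I], undoing the above).  */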
4109 /* Return the cfg loop that contains NODE. */
4111 struct loop *
4112 vect_optimize_slp_pass::containing_loop (slp_tree node)
4114 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4115 if (!rep)
4116 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4117 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4120 /* Return true if UD (an edge from a use to a definition) is associated
4121 with a loop latch edge in the cfg. */
4123 bool
4124 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4126 slp_tree use = m_vertices[ud->src].node;
4127 slp_tree def = m_vertices[ud->dest].node;
4128 if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4129 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4130 return false;
4132 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4133 return (is_a<gphi *> (use_rep->stmt)
4134 && bb_loop_header_p (gimple_bb (use_rep->stmt))
4135 && containing_loop (def) == containing_loop (use));
4138 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4139 a nonnull data field. */
4141 void
4142 vect_optimize_slp_pass::build_graph ()
4144 m_optimize_size = true;
4145 build_vertices ();
4147 m_slpg = new_graph (m_vertices.length ());
4148 for (slpg_vertex &v : m_vertices)
4149 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4150 if (child)
4152 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4153 if (is_cfg_latch_edge (ud))
4154 ud->data = this;
4158 /* Return true if E corresponds to a loop latch edge in the cfg. */
4160 static bool
4161 skip_cfg_latch_edges (graph_edge *e)
4163 return e->data;
4166 /* Create the node partitions. */
4168 void
4169 vect_optimize_slp_pass::create_partitions ()
4171 /* Calculate a postorder of the graph, ignoring edges that correspond
4172 to natural latch edges in the cfg. Reading the vector from the end
4173 to the beginning gives the reverse postorder. */
4174 auto_vec<int> initial_rpo;
4175 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4176 false, NULL, skip_cfg_latch_edges);
4177 gcc_assert (initial_rpo.length () == m_vertices.length ());
4179 /* Calculate the strongly connected components of the graph. */
4180 auto_vec<int> scc_grouping;
4181 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4183 /* Create a new index order in which all nodes from the same SCC are
4184 consecutive. Use scc_pos to record the index of the first node in
4185 each SCC. */
4186 auto_vec<unsigned int> scc_pos (num_sccs);
4187 int last_component = -1;
4188 unsigned int node_count = 0;
4189 for (unsigned int node_i : scc_grouping)
4191 if (last_component != m_slpg->vertices[node_i].component)
4193 last_component = m_slpg->vertices[node_i].component;
4194 gcc_assert (last_component == int (scc_pos.length ()));
4195 scc_pos.quick_push (node_count);
4197 node_count += 1;
4199 gcc_assert (node_count == initial_rpo.length ()
4200 && last_component + 1 == int (num_sccs));
4202 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4203 inside each SCC following the RPO we calculated above. The fact that
4204 we ignored natural latch edges when calculating the RPO should ensure
4205 that, for natural loop nests:
4207 - the first node that we encounter in a cfg loop is the loop header phi
4208 - the loop header phis are in dominance order
4210 Arranging for this is an optimization (see below) rather than a
4211 correctness issue. Unnatural loops with a tangled mess of backedges
4212 will still work correctly, but might give poorer results.
4214 Also update scc_pos so that it gives 1 + the index of the last node
4215 in the SCC. */
4216 m_partitioned_nodes.safe_grow (node_count);
4217 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4219 unsigned int node_i = initial_rpo[old_i];
4220 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4221 m_partitioned_nodes[new_i] = node_i;
4224 /* When optimizing for speed, partition each SCC based on the containing
4225 cfg loop. The order we constructed above should ensure that, for natural
4226 cfg loops, we'll create sub-SCC partitions for outer loops before
4227 the corresponding sub-SCC partitions for inner loops. Similarly,
4228 when one sibling loop A dominates another sibling loop B, we should
4229 create a sub-SCC partition for A before a sub-SCC partition for B.
4231 As above, nothing depends for correctness on whether this achieves
4232 a natural nesting, but we should get better results when it does. */
4233 m_partitions.reserve (m_vertices.length ());
4234 unsigned int next_partition_i = 0;
4235 hash_map<struct loop *, int> loop_partitions;
4236 unsigned int rpo_begin = 0;
4237 unsigned int num_partitioned_nodes = 0;
4238 for (unsigned int rpo_end : scc_pos)
4240 loop_partitions.empty ();
4241 unsigned int partition_i = next_partition_i;
4242 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4244 /* Handle externals and constants optimistically throughout.
4245 But treat existing vectors as fixed since we do not handle
4246 permuting them. */
4247 unsigned int node_i = m_partitioned_nodes[rpo_i];
4248 auto &vertex = m_vertices[node_i];
4249 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4250 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4251 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4252 vertex.partition = -1;
4253 else
4255 bool existed;
4256 if (m_optimize_size)
4257 existed = next_partition_i > partition_i;
4258 else
4260 struct loop *loop = containing_loop (vertex.node);
4261 auto &entry = loop_partitions.get_or_insert (loop, &existed);
4262 if (!existed)
4263 entry = next_partition_i;
4264 partition_i = entry;
4266 if (!existed)
4268 m_partitions.quick_push (slpg_partition_info ());
4269 next_partition_i += 1;
4271 vertex.partition = partition_i;
4272 num_partitioned_nodes += 1;
4273 m_partitions[partition_i].node_end += 1;
4276 rpo_begin = rpo_end;
4279 /* Assign ranges of consecutive node indices to each partition,
4280 in partition order. Start with node_end being the same as
4281 node_begin so that the next loop can use it as a counter. */
4282 unsigned int node_begin = 0;
4283 for (auto &partition : m_partitions)
4285 partition.node_begin = node_begin;
4286 node_begin += partition.node_end;
4287 partition.node_end = partition.node_begin;
4289 gcc_assert (node_begin == num_partitioned_nodes);
4291 /* Finally build the list of nodes in partition order. */
4292 m_partitioned_nodes.truncate (num_partitioned_nodes);
4293 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4295 int partition_i = m_vertices[node_i].partition;
4296 if (partition_i >= 0)
4298 unsigned int order_i = m_partitions[partition_i].node_end++;
4299 m_partitioned_nodes[order_i] = node_i;
4304 /* Look for edges from earlier partitions into node NODE_I and edges from
4305 node NODE_I into later partitions. Call:
4307 FN (ud, other_node_i)
4309 for each such use-to-def edge ud, where other_node_i is the node at the
4310 other end of the edge. */
4312 template<typename T>
4313 void
4314 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4316 int partition_i = m_vertices[node_i].partition;
4317 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4318 pred; pred = pred->pred_next)
4320 int src_partition_i = m_vertices[pred->src].partition;
4321 if (src_partition_i >= 0 && src_partition_i != partition_i)
4322 fn (pred, pred->src);
4324 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4325 succ; succ = succ->succ_next)
4327 int dest_partition_i = m_vertices[succ->dest].partition;
4328 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4329 fn (succ, succ->dest);
4333 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4334 that NODE would operate on. This test is independent of NODE's actual
4335 operation. */
4337 bool
4338 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4339 unsigned int layout_i)
4341 if (layout_i == 0)
4342 return true;
4344 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4345 return false;
4347 return true;
4350 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4351 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4352 layouts is incompatible with NODE or if the change is not possible for
4353 some other reason.
4355 The properties taken from NODE include the number of lanes and the
4356 vector type. The actual operation doesn't matter. */
4359 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4360 unsigned int from_layout_i,
4361 unsigned int to_layout_i)
4363 if (!is_compatible_layout (node, from_layout_i)
4364 || !is_compatible_layout (node, to_layout_i))
4365 return -1;
4367 if (from_layout_i == to_layout_i)
4368 return 0;
4370 auto_vec<slp_tree, 1> children (1);
4371 children.quick_push (node);
4372 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4373 if (from_layout_i > 0)
4374 for (unsigned int i : m_perms[from_layout_i])
4375 perm.quick_push ({ 0, i });
4376 else
4377 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4378 perm.quick_push ({ 0, i });
4379 if (to_layout_i > 0)
4380 vect_slp_permute (m_perms[to_layout_i], perm, true);
4381 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4382 children, false);
4383 if (count >= 0)
4384 return MAX (count, 1);
4386 /* ??? In principle we could try changing via layout 0, giving two
4387 layout changes rather than 1. Doing that would require
4388 corresponding support in get_result_with_layout. */
4389 return -1;
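/* For illustration (not part of the implementation): if
   m_perms[FROM_LAYOUT_I] = { 1, 0, 3, 2 } and
   m_perms[TO_LAYOUT_I] = { 2, 3, 0, 1 }, the single-input lane
   permutation built above is { 3, 2, 1, 0 }, and the returned cost is
   the number of vector permutes that vectorizable_slp_permutation_1
   estimates for it (clamped to at least 1).  */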
4392 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4394 inline slpg_partition_layout_costs &
4395 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4396 unsigned int layout_i)
4398 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4401 /* Change PERM in one of two ways:
4403 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4404 chosen for child I of NODE.
4406 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4408 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
4410 void
4411 vect_optimize_slp_pass::
4412 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4413 int in_layout_i, unsigned int out_layout_i)
4415 for (auto &entry : perm)
4417 int this_in_layout_i = in_layout_i;
4418 if (this_in_layout_i < 0)
4420 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4421 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4422 this_in_layout_i = m_partitions[in_partition_i].layout;
4424 if (this_in_layout_i > 0)
4425 entry.second = m_perms[this_in_layout_i][entry.second];
4427 if (out_layout_i > 0)
4428 vect_slp_permute (m_perms[out_layout_i], perm, true);
4431 /* Check whether the target allows NODE to be rearranged so that the node's
4432 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4433 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4435 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4436 NODE can adapt to the layout changes that have (perhaps provisionally)
4437 been chosen for NODE's children, so that no extra permutations are
4438 needed on either the input or the output of NODE.
4440 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4441 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4443 IN_LAYOUT_I has no meaning for other types of node.
4445 Keeping the node as-is is always valid. If the target doesn't appear
4446 to support the node as-is, but might realistically support other layouts,
4447 then layout 0 instead has the cost of a worst-case permutation. On the
4448 one hand, this ensures that every node has at least one valid layout,
4449 avoiding what would otherwise be an awkward special case. On the other,
4450 it still encourages the pass to change an invalid pre-existing layout
4451 choice into a valid one. */
4454 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4455 unsigned int out_layout_i)
4457 const int fallback_cost = 1;
4459 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4461 auto_lane_permutation_t tmp_perm;
4462 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4464 /* Check that the child nodes support the chosen layout. Checking
4465 the first child is enough, since any second child would have the
4466 same shape. */
4467 auto first_child = SLP_TREE_CHILDREN (node)[0];
4468 if (in_layout_i > 0
4469 && !is_compatible_layout (first_child, in_layout_i))
4470 return -1;
4472 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4473 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4474 node, tmp_perm,
4475 SLP_TREE_CHILDREN (node),
4476 false);
4477 if (count < 0)
4479 if (in_layout_i == 0 && out_layout_i == 0)
4481 /* Use the fallback cost if the node could in principle support
4482 some nonzero layout for both the inputs and the outputs.
4483 Otherwise assume that the node will be rejected later
4484 and rebuilt from scalars. */
4485 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4486 return fallback_cost;
4487 return 0;
4489 return -1;
4492 /* We currently have no way of telling whether the new layout is cheaper
4493 or more expensive than the old one. But at least in principle,
4494 it should be worth making zero permutations (whole-vector shuffles)
4495 cheaper than real permutations, in case the pass is able to remove
4496 the latter. */
4497 return count == 0 ? 0 : 1;
4500 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4501 if (rep
4502 && STMT_VINFO_DATA_REF (rep)
4503 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4504 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4506 auto_load_permutation_t tmp_perm;
4507 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4508 if (out_layout_i > 0)
4509 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4511 poly_uint64 vf = 1;
4512 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4513 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4514 unsigned int n_perms;
4515 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4516 nullptr, vf, true, false, &n_perms))
4518 auto rep = SLP_TREE_REPRESENTATIVE (node);
4519 if (out_layout_i == 0)
4521 /* Use the fallback cost if the load is an N-to-N permutation.
4522 Otherwise assume that the node will be rejected later
4523 and rebuilt from scalars. */
4524 if (STMT_VINFO_GROUPED_ACCESS (rep)
4525 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4526 == SLP_TREE_LANES (node)))
4527 return fallback_cost;
4528 return 0;
4530 return -1;
4533 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4534 return n_perms == 0 ? 0 : 1;
4537 return 0;
4540 /* Decide which element layouts we should consider using. Calculate the
4541 weights associated with inserting layout changes on partition edges.
4542 Also mark partitions that cannot change layout, by setting their
4543 layout to zero. */
4545 void
4546 vect_optimize_slp_pass::start_choosing_layouts ()
4548 /* Used to assign unique permutation indices. */
4549 using perm_hash = unbounded_hashmap_traits<
4550 vec_free_hash_base<int_hash_base<unsigned>>,
4551 int_hash<int, -1, -2>
4553 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4555 /* Layout 0 is "no change". */
4556 m_perms.safe_push (vNULL);
4558 /* Create layouts from existing permutations. */
4559 auto_load_permutation_t tmp_perm;
4560 for (unsigned int node_i : m_partitioned_nodes)
4562 /* Leafs also double as entries to the reverse graph. Allow the
4563 layout of those to be changed. */
4564 auto &vertex = m_vertices[node_i];
4565 auto &partition = m_partitions[vertex.partition];
4566 if (!m_slpg->vertices[node_i].succ)
4567 partition.layout = 0;
4569 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4570 slp_tree node = vertex.node;
4571 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4572 slp_tree child;
4573 unsigned HOST_WIDE_INT imin, imax = 0;
4574 bool any_permute = false;
4575 tmp_perm.truncate (0);
4576 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4578 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4579 unpermuted, record a layout that reverses this permutation.
4581 We would need more work to cope with loads that are internally
4582 permuted and also have inputs (such as masks for
4583 IFN_MASK_LOADs). */
4584 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4585 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4586 continue;
4587 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4588 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4589 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4591 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4592 && SLP_TREE_CHILDREN (node).length () == 1
4593 && (child = SLP_TREE_CHILDREN (node)[0])
4594 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4595 .is_constant (&imin)))
4597 /* If the child has the same vector size as this node,
4598 reversing the permutation can make the permutation a no-op.
4599 In other cases it can change a true permutation into a
4600 full-vector extract. */
4601 tmp_perm.reserve (SLP_TREE_LANES (node));
4602 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4603 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4605 else
4606 continue;
4608 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4610 unsigned idx = tmp_perm[j];
4611 imin = MIN (imin, idx);
4612 imax = MAX (imax, idx);
4613 if (idx - tmp_perm[0] != j)
4614 any_permute = true;
4616 /* If the span doesn't match we'd disrupt VF computation; avoid
4617 that for now. */
4618 if (imax - imin + 1 != SLP_TREE_LANES (node))
4619 continue;
4620 /* If there's no permute no need to split one out. In this case
4621 we can consider turning a load into a permuted load, if that
4622 turns out to be cheaper than alternatives. */
4623 if (!any_permute)
4625 partition.layout = -1;
4626 continue;
4629 /* For now only handle true permutes, like
4630 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4631 when permuting constants and invariants, keeping the permute
4632 bijective. */
4633 auto_sbitmap load_index (SLP_TREE_LANES (node));
4634 bitmap_clear (load_index);
4635 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4636 bitmap_set_bit (load_index, tmp_perm[j] - imin);
4637 unsigned j;
4638 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4639 if (!bitmap_bit_p (load_index, j))
4640 break;
4641 if (j != SLP_TREE_LANES (node))
4642 continue;
4644 vec<unsigned> perm = vNULL;
4645 perm.safe_grow (SLP_TREE_LANES (node), true);
4646 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4647 perm[j] = tmp_perm[j] - imin;
4649 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4651 /* Continue to use existing layouts, but don't add any more. */
4652 int *entry = layout_ids.get (perm);
4653 partition.layout = entry ? *entry : 0;
4654 perm.release ();
4656 else
4658 bool existed;
4659 int &layout_i = layout_ids.get_or_insert (perm, &existed);
4660 if (existed)
4661 perm.release ();
4662 else
4664 layout_i = m_perms.length ();
4665 m_perms.safe_push (perm);
4667 partition.layout = layout_i;
4671 /* Initially assume that every layout is possible and has zero cost
4672 in every partition. */
4673 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4674 * m_perms.length ());
4676 /* We have to mark as to-be-materialized the outgoing permutations facing
4677 non-reduction graph entries that are not explicitly represented. */
4678 for (slp_instance instance : m_vinfo->slp_instances)
4679 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4681 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4682 m_partitions[m_vertices[node_i].partition].layout = 0;
4685 /* Check which layouts each node and partition can handle. Calculate the
4686 weights associated with inserting layout changes on edges. */
4687 for (unsigned int node_i : m_partitioned_nodes)
4689 auto &vertex = m_vertices[node_i];
4690 auto &partition = m_partitions[vertex.partition];
4691 slp_tree node = vertex.node;
4693 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4695 vertex.weight = vect_slp_node_weight (node);
4697 /* We do not handle stores with a permutation, so all
4698 incoming permutations must have been materialized.
4700 We also don't handle masked grouped loads, which lack a
4701 permutation vector. In this case the memory locations
4702 form an implicit second input to the loads, on top of the
4703 explicit mask input, and the memory input's layout cannot
4704 be changed.
4706 On the other hand, we do support permuting gather loads and
4707 masked gather loads, where each scalar load is independent
4708 of the others. This can be useful if the address/index input
4709 benefits from permutation. */
4710 if (STMT_VINFO_DATA_REF (rep)
4711 && STMT_VINFO_GROUPED_ACCESS (rep)
4712 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4713 partition.layout = 0;
4715 /* We cannot change the layout of an operation whose lanes are
4716 not independent of each other. Note this is an explicit
4717 negative list since that's much shorter than the respective
4718 positive one, but it's critical to keep maintaining it. */
4719 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4720 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4722 case CFN_COMPLEX_ADD_ROT90:
4723 case CFN_COMPLEX_ADD_ROT270:
4724 case CFN_COMPLEX_MUL:
4725 case CFN_COMPLEX_MUL_CONJ:
4726 case CFN_VEC_ADDSUB:
4727 case CFN_VEC_FMADDSUB:
4728 case CFN_VEC_FMSUBADD:
4729 partition.layout = 0;
4730 default:;
4734 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4736 auto &other_vertex = m_vertices[other_node_i];
4738 /* Count the number of edges from earlier partitions and the number
4739 of edges to later partitions. */
4740 if (other_vertex.partition < vertex.partition)
4741 partition.in_degree += 1;
4742 else
4743 partition.out_degree += 1;
4745 /* If the current node uses the result of OTHER_NODE_I, accumulate
4746 the effects of that. */
4747 if (ud->src == int (node_i))
4749 other_vertex.out_weight += vertex.weight;
4750 other_vertex.out_degree += 1;
4753 for_each_partition_edge (node_i, process_edge);
4757 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4758 its current (provisional) choice of layout. The inputs do not necessarily
4759 have the same layout as each other. */
4761 slpg_layout_cost
4762 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4764 auto &vertex = m_vertices[node_i];
4765 slpg_layout_cost cost;
4766 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4768 auto &other_vertex = m_vertices[other_node_i];
4769 if (other_vertex.partition < vertex.partition)
4771 auto &other_partition = m_partitions[other_vertex.partition];
4772 auto &other_costs = partition_layout_costs (other_vertex.partition,
4773 other_partition.layout);
4774 slpg_layout_cost this_cost = other_costs.in_cost;
4775 this_cost.add_serial_cost (other_costs.internal_cost);
4776 this_cost.split (other_partition.out_degree);
4777 cost.add_parallel_cost (this_cost);
4780 for_each_partition_edge (node_i, add_cost);
4781 return cost;
4784 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4785 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4786 slpg_layout_cost::impossible () if the change isn't possible. */
4788 slpg_layout_cost
4789 vect_optimize_slp_pass::
4790 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4791 unsigned int layout2_i)
4793 auto &def_vertex = m_vertices[ud->dest];
4794 auto &use_vertex = m_vertices[ud->src];
4795 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4796 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4797 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4798 use_layout_i);
4799 if (factor < 0)
4800 return slpg_layout_cost::impossible ();
4802 /* We have a choice of putting the layout change at the site of the
4803 definition or at the site of the use. Prefer the former when
4804 optimizing for size or when the execution frequency of the
4805 definition is no greater than the combined execution frequencies of
4806 the uses. When putting the layout change at the site of the definition,
4807 divvy up the cost among all consumers. */
4808 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4810 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4811 cost.split (def_vertex.out_degree);
4812 return cost;
4814 return { use_vertex.weight * factor, m_optimize_size };
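/* For illustration (not part of the implementation): when optimizing for
   speed with FACTOR = 1, a definition of weight 1 whose three uses have a
   combined weight of 30 gets the change at the definition site, costing
   { depth 1, total 1 } split three ways, i.e. { depth 1, total 1/3 }.
   If instead the definition has weight 10 and its single use has weight 1,
   the change is placed at the use, costing { depth 1, total 1 }.  */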
4817 /* UD represents a use-def link between FROM_NODE_I and a node in a later
4818 partition; FROM_NODE_I could be the definition node or the use node.
4819 The node at the other end of the link wants to use layout TO_LAYOUT_I.
4820 Return the cost of any necessary fix-ups on edge UD, or return
4821 slpg_layout_cost::impossible () if the change isn't possible.
4823 At this point, FROM_NODE_I's partition has chosen the cheapest
4824 layout based on the information available so far, but this choice
4825 is only provisional. */
4827 slpg_layout_cost
4828 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
4829 unsigned int to_layout_i)
4831 auto &from_vertex = m_vertices[from_node_i];
4832 unsigned int from_partition_i = from_vertex.partition;
4833 slpg_partition_info &from_partition = m_partitions[from_partition_i];
4834 gcc_assert (from_partition.layout >= 0);
4836 /* First calculate the cost on the assumption that FROM_PARTITION sticks
4837 with its current layout preference. */
4838 slpg_layout_cost cost = slpg_layout_cost::impossible ();
4839 auto edge_cost = edge_layout_cost (ud, from_node_i,
4840 from_partition.layout, to_layout_i);
4841 if (edge_cost.is_possible ())
4843 auto &from_costs = partition_layout_costs (from_partition_i,
4844 from_partition.layout);
4845 cost = from_costs.in_cost;
4846 cost.add_serial_cost (from_costs.internal_cost);
4847 cost.split (from_partition.out_degree);
4848 cost.add_serial_cost (edge_cost);
4851 /* Take the minimum of that cost and the cost that applies if
4852 FROM_PARTITION instead switches to TO_LAYOUT_I. */
4853 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
4854 to_layout_i);
4855 if (direct_layout_costs.is_possible ())
4857 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
4858 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
4859 direct_cost.split (from_partition.out_degree);
4860 if (!cost.is_possible ()
4861 || direct_cost.is_better_than (cost, m_optimize_size))
4862 cost = direct_cost;
4865 return cost;
4868 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
4869 partition; TO_NODE_I could be the definition node or the use node.
4870 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
4871 return the cost of any necessary fix-ups on edge UD, or
4872 slpg_layout_cost::impossible () if the choice cannot be made.
4874 At this point, TO_NODE_I's partition has a fixed choice of layout. */
4876 slpg_layout_cost
4877 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
4878 unsigned int from_layout_i)
4880 auto &to_vertex = m_vertices[to_node_i];
4881 unsigned int to_partition_i = to_vertex.partition;
4882 slpg_partition_info &to_partition = m_partitions[to_partition_i];
4883 gcc_assert (to_partition.layout >= 0);
4885 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
4886 adjusted for this input having layout FROM_LAYOUT_I. Assume that
4887 any other inputs keep their current choice of layout. */
4888 auto &to_costs = partition_layout_costs (to_partition_i,
4889 to_partition.layout);
4890 if (ud->src == int (to_node_i)
4891 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
4893 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
4894 auto old_layout = from_partition.layout;
4895 from_partition.layout = from_layout_i;
4896 int factor = internal_node_cost (to_vertex.node, -1,
4897 to_partition.layout);
4898 from_partition.layout = old_layout;
4899 if (factor >= 0)
4901 slpg_layout_cost cost = to_costs.out_cost;
4902 cost.add_serial_cost ({ to_vertex.weight * factor,
4903 m_optimize_size });
4904 cost.split (to_partition.in_degree);
4905 return cost;
4909 /* Compute the cost if we insert any necessary layout change on edge UD. */
4910 auto edge_cost = edge_layout_cost (ud, to_node_i,
4911 to_partition.layout, from_layout_i);
4912 if (edge_cost.is_possible ())
4914 slpg_layout_cost cost = to_costs.out_cost;
4915 cost.add_serial_cost (to_costs.internal_cost);
4916 cost.split (to_partition.in_degree);
4917 cost.add_serial_cost (edge_cost);
4918 return cost;
4921 return slpg_layout_cost::impossible ();
4924 /* Make a forward pass through the partitions, accumulating input costs.
4925 Make a tentative (provisional) choice of layout for each partition,
4926 ensuring that this choice still allows later partitions to keep
4927 their original layout. */
4929 void
4930 vect_optimize_slp_pass::forward_pass ()
4932 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
4933 ++partition_i)
4935 auto &partition = m_partitions[partition_i];
4937 /* If the partition consists of a single VEC_PERM_EXPR, precompute
4938 the incoming cost that would apply if every predecessor partition
4939 keeps its current layout. This is used within the loop below. */
4940 slpg_layout_cost in_cost;
4941 slp_tree single_node = nullptr;
4942 if (partition.node_end == partition.node_begin + 1)
4944 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
4945 single_node = m_vertices[node_i].node;
4946 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
4947 in_cost = total_in_cost (node_i);
4950 /* Go through the possible layouts. Decide which ones are valid
4951 for this partition and record which of the valid layouts has
4952 the lowest cost. */
4953 unsigned int min_layout_i = 0;
4954 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
4955 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
4957 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
4958 if (!layout_costs.is_possible ())
4959 continue;
4961 /* If the recorded layout is already 0 then the layout cannot
4962 change. */
4963 if (partition.layout == 0 && layout_i != 0)
4965 layout_costs.mark_impossible ();
4966 continue;
4969 bool is_possible = true;
4970 for (unsigned int order_i = partition.node_begin;
4971 order_i < partition.node_end; ++order_i)
4973 unsigned int node_i = m_partitioned_nodes[order_i];
4974 auto &vertex = m_vertices[node_i];
4976 /* Reject the layout if it is individually incompatible
4977 with any node in the partition. */
4978 if (!is_compatible_layout (vertex.node, layout_i))
4980 is_possible = false;
4981 break;
4984 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
4986 auto &other_vertex = m_vertices[other_node_i];
4987 if (other_vertex.partition < vertex.partition)
4989 /* Accumulate the incoming costs from earlier
4990 partitions, plus the cost of any layout changes
4991 on UD itself. */
4992 auto cost = forward_cost (ud, other_node_i, layout_i);
4993 if (!cost.is_possible ())
4994 is_possible = false;
4995 else
4996 layout_costs.in_cost.add_parallel_cost (cost);
4998 else
4999 /* Reject the layout if it would make layout 0 impossible
5000 for later partitions. This amounts to testing that the
5001 target supports reversing the layout change on edges
5002 to later partitions.
5004 In principle, it might be possible to push a layout
5005 change all the way down a graph, so that it never
5006 needs to be reversed and so that the target doesn't
5007 need to support the reverse operation. But it would
5008 be awkward to bail out if we hit a partition that
5009 does not support the new layout, especially since
5010 we are not dealing with a lattice. */
5011 is_possible &= edge_layout_cost (ud, other_node_i, 0,
5012 layout_i).is_possible ();
5014 for_each_partition_edge (node_i, add_cost);
5016 /* Accumulate the cost of using LAYOUT_I within NODE,
5017 both for the inputs and the outputs. */
5018 int factor = internal_node_cost (vertex.node, layout_i,
5019 layout_i);
5020 if (factor < 0)
5022 is_possible = false;
5023 break;
5025 else if (factor)
5026 layout_costs.internal_cost.add_serial_cost
5027 ({ vertex.weight * factor, m_optimize_size });
5029 if (!is_possible)
5031 layout_costs.mark_impossible ();
5032 continue;
5035 /* Combine the incoming and partition-internal costs. */
5036 slpg_layout_cost combined_cost = layout_costs.in_cost;
5037 combined_cost.add_serial_cost (layout_costs.internal_cost);
5039 /* If this partition consists of a single VEC_PERM_EXPR, see
5040 if the VEC_PERM_EXPR can be changed to support output layout
5041 LAYOUT_I while keeping all the provisional choices of input
5042 layout. */
5043 if (single_node
5044 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5046 int factor = internal_node_cost (single_node, -1, layout_i);
5047 if (factor >= 0)
5049 auto weight = m_vertices[single_node->vertex].weight;
5050 slpg_layout_cost internal_cost
5051 = { weight * factor, m_optimize_size };
5053 slpg_layout_cost alt_cost = in_cost;
5054 alt_cost.add_serial_cost (internal_cost);
5055 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5057 combined_cost = alt_cost;
5058 layout_costs.in_cost = in_cost;
5059 layout_costs.internal_cost = internal_cost;
5064 /* Record the layout with the lowest cost. Prefer layout 0 in
5065 the event of a tie between it and another layout. */
5066 if (!min_layout_cost.is_possible ()
5067 || combined_cost.is_better_than (min_layout_cost,
5068 m_optimize_size))
5070 min_layout_i = layout_i;
5071 min_layout_cost = combined_cost;
5075 /* This loop's handling of earlier partitions should ensure that
5076 choosing the original layout for the current partition is no
5077 less valid than it was in the original graph, even with the
5078 provisional layout choices for those earlier partitions. */
5079 gcc_assert (min_layout_cost.is_possible ());
5080 partition.layout = min_layout_i;
5084 /* Make a backward pass through the partitions, accumulating output costs.
5085 Make a final choice of layout for each partition. */
5087 void
5088 vect_optimize_slp_pass::backward_pass ()
5090 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5092 auto &partition = m_partitions[partition_i];
5094 unsigned int min_layout_i = 0;
5095 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5096 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5098 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5099 if (!layout_costs.is_possible ())
5100 continue;
5102 /* Accumulate the costs from successor partitions. */
5103 bool is_possible = true;
5104 for (unsigned int order_i = partition.node_begin;
5105 order_i < partition.node_end; ++order_i)
5107 unsigned int node_i = m_partitioned_nodes[order_i];
5108 auto &vertex = m_vertices[node_i];
5109 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5111 auto &other_vertex = m_vertices[other_node_i];
5112 auto &other_partition = m_partitions[other_vertex.partition];
5113 if (other_vertex.partition > vertex.partition)
5115 /* Accumulate the incoming costs from later
5116 partitions, plus the cost of any layout changes
5117 on UD itself. */
5118 auto cost = backward_cost (ud, other_node_i, layout_i);
5119 if (!cost.is_possible ())
5120 is_possible = false;
5121 else
5122 layout_costs.out_cost.add_parallel_cost (cost);
5124 else
5125 /* Make sure that earlier partitions can (if necessary
5126 or beneficial) keep the layout that they chose in
5127 the forward pass. This ensures that there is at
5128 least one valid choice of layout. */
5129 is_possible &= edge_layout_cost (ud, other_node_i,
5130 other_partition.layout,
5131 layout_i).is_possible ();
5133 for_each_partition_edge (node_i, add_cost);
5135 if (!is_possible)
5137 layout_costs.mark_impossible ();
5138 continue;
5141 /* Locally combine the costs from the forward and backward passes.
5142 (This combined cost is not passed on, since that would lead
5143 to double counting.) */
5144 slpg_layout_cost combined_cost = layout_costs.in_cost;
5145 combined_cost.add_serial_cost (layout_costs.internal_cost);
5146 combined_cost.add_serial_cost (layout_costs.out_cost);
5148 /* Record the layout with the lowest cost. Prefer layout 0 in
5149 the event of a tie between it and another layout. */
5150 if (!min_layout_cost.is_possible ()
5151 || combined_cost.is_better_than (min_layout_cost,
5152 m_optimize_size))
5154 min_layout_i = layout_i;
5155 min_layout_cost = combined_cost;
5159 gcc_assert (min_layout_cost.is_possible ());
5160 partition.layout = min_layout_i;
5164 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5165 NODE already has the layout that was selected for its partition. */
5167 slp_tree
5168 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5169 unsigned int to_layout_i)
5171 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5172 slp_tree result = m_node_layouts[result_i];
5173 if (result)
5174 return result;
5176 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5177 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5179 /* If the vector is uniform or unchanged, there's nothing to do. */
5180 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5181 result = node;
5182 else
5184 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5185 result = vect_create_new_slp_node (scalar_ops);
5186 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5189 else
5191 unsigned int partition_i = m_vertices[node->vertex].partition;
5192 unsigned int from_layout_i = m_partitions[partition_i].layout;
5193 if (from_layout_i == to_layout_i)
5194 return node;
5196 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5197 permutation instead of a serial one. Leave the new permutation
5198 in TMP_PERM on success. */
5199 auto_lane_permutation_t tmp_perm;
5200 unsigned int num_inputs = 1;
5201 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5203 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5204 if (from_layout_i != 0)
5205 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5206 if (to_layout_i != 0)
5207 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5208 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5209 tmp_perm,
5210 SLP_TREE_CHILDREN (node),
5211 false) >= 0)
5212 num_inputs = SLP_TREE_CHILDREN (node).length ();
5213 else
5214 tmp_perm.truncate (0);
5217 if (dump_enabled_p ())
5219 if (tmp_perm.length () > 0)
5220 dump_printf_loc (MSG_NOTE, vect_location,
5221 "duplicating permutation node %p with"
5222 " layout %d\n",
5223 (void *) node, to_layout_i);
5224 else
5225 dump_printf_loc (MSG_NOTE, vect_location,
5226 "inserting permutation node in place of %p\n",
5227 (void *) node);
5230 unsigned int num_lanes = SLP_TREE_LANES (node);
5231 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5232 if (SLP_TREE_SCALAR_STMTS (node).length ())
5234 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5235 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5236 if (from_layout_i != 0)
5237 vect_slp_permute (m_perms[from_layout_i], stmts, false);
5238 if (to_layout_i != 0)
5239 vect_slp_permute (m_perms[to_layout_i], stmts, true);
5241 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5242 SLP_TREE_LANES (result) = num_lanes;
5243 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5244 result->vertex = -1;
5246 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5247 if (tmp_perm.length ())
5249 lane_perm.safe_splice (tmp_perm);
5250 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5252 else
5254 lane_perm.create (num_lanes);
5255 for (unsigned j = 0; j < num_lanes; ++j)
5256 lane_perm.quick_push ({ 0, j });
5257 if (from_layout_i != 0)
5258 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5259 if (to_layout_i != 0)
5260 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5261 SLP_TREE_CHILDREN (result).safe_push (node);
5263 for (slp_tree child : SLP_TREE_CHILDREN (result))
5264 child->refcnt++;
5266 m_node_layouts[result_i] = result;
5267 return result;
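/* Note on the cache used above (illustrative arithmetic only): M_NODE_LAYOUTS
   is a flattened N x P matrix, indexed as node->vertex * m_perms.length ()
   + to_layout_i.  For example, with P == 3 layouts, asking for layout 2 of
   the node at vertex 5 uses slot 5 * 3 + 2 == 17; materialize () below grows
   the array to N * P entries before the lookups happen.  */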
5270 /* Apply the chosen vector layouts to the SLP graph. */
5272 void
5273 vect_optimize_slp_pass::materialize ()
5275 /* We no longer need the costs, so avoid having two O(N * P) arrays
5276 live at the same time. */
5277 m_partition_layout_costs.release ();
5278 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5280 auto_sbitmap fully_folded (m_vertices.length ());
5281 bitmap_clear (fully_folded);
5282 for (unsigned int node_i : m_partitioned_nodes)
5284 auto &vertex = m_vertices[node_i];
5285 slp_tree node = vertex.node;
5286 int layout_i = m_partitions[vertex.partition].layout;
5287 gcc_assert (layout_i >= 0);
5289 /* Rearrange the scalar statements to match the chosen layout. */
5290 if (layout_i > 0)
5291 vect_slp_permute (m_perms[layout_i],
5292 SLP_TREE_SCALAR_STMTS (node), true);
5294 /* Update load and lane permutations. */
5295 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5297 /* First try to absorb the input vector layouts. If that fails,
5298 force the inputs to have layout LAYOUT_I too. We checked that
5299 that was possible before deciding to use nonzero output layouts.
5300 (Note that at this stage we don't really have any guarantee that
5301 the target supports the original VEC_PERM_EXPR.) */
5302 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5303 auto_lane_permutation_t tmp_perm;
5304 tmp_perm.safe_splice (perm);
5305 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5306 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5307 tmp_perm,
5308 SLP_TREE_CHILDREN (node),
5309 false) >= 0)
5311 if (dump_enabled_p ()
5312 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5313 perm.begin ()))
5314 dump_printf_loc (MSG_NOTE, vect_location,
5315 "absorbing input layouts into %p\n",
5316 (void *) node);
5317 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5318 bitmap_set_bit (fully_folded, node_i);
5320 else
5322 /* Not MSG_MISSED because it would make no sense to users. */
5323 if (dump_enabled_p ())
5324 dump_printf_loc (MSG_NOTE, vect_location,
5325 "failed to absorb input layouts into %p\n",
5326 (void *) node);
5327 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5330 else
5332 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5333 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5334 if (layout_i > 0)
5335 /* ??? When we handle non-bijective permutes the idea
5336 is that we can force the load-permutation to be
5337 { min, min + 1, min + 2, ... max }. But then the
5338 scalar defs might no longer match the lane content
5339 which means wrong-code with live lane vectorization.
5340 So we possibly have to have NULL entries for those. */
5341 vect_slp_permute (m_perms[layout_i], load_perm, true);
5345 /* Do this before any nodes disappear, since it involves a walk
5346 over the leaves. */
5347 remove_redundant_permutations ();
5349 /* Replace each child with a correctly laid-out version. */
5350 for (unsigned int node_i : m_partitioned_nodes)
5352 /* Skip nodes that have already been handled above. */
5353 if (bitmap_bit_p (fully_folded, node_i))
5354 continue;
5356 auto &vertex = m_vertices[node_i];
5357 int in_layout_i = m_partitions[vertex.partition].layout;
5358 gcc_assert (in_layout_i >= 0);
5360 unsigned j;
5361 slp_tree child;
5362 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5364 if (!child)
5365 continue;
5367 slp_tree new_child = get_result_with_layout (child, in_layout_i);
5368 if (new_child != child)
5370 vect_free_slp_tree (child);
5371 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5372 new_child->refcnt += 1;
5378 /* Elide load permutations that are not necessary. Such permutations might
5379 be pre-existing, rather than created by the layout optimizations. */
5381 void
5382 vect_optimize_slp_pass::remove_redundant_permutations ()
5384 for (unsigned int node_i : m_leafs)
5386 slp_tree node = m_vertices[node_i].node;
5387 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5388 continue;
5390 /* In basic block vectorization we allow any subchain of an interleaving
5391 chain.
5392 FORNOW: not in loop SLP because of realignment complications. */
5393 if (is_a <bb_vec_info> (m_vinfo))
5395 bool subchain_p = true;
5396 stmt_vec_info next_load_info = NULL;
5397 stmt_vec_info load_info;
5398 unsigned j;
5399 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5401 if (j != 0
5402 && (next_load_info != load_info
5403 || DR_GROUP_GAP (load_info) != 1))
5405 subchain_p = false;
5406 break;
5408 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5410 if (subchain_p)
5412 SLP_TREE_LOAD_PERMUTATION (node).release ();
5413 continue;
5416 else
5418 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5419 stmt_vec_info load_info;
5420 bool this_load_permuted = false;
5421 unsigned j;
5422 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5423 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5425 this_load_permuted = true;
5426 break;
5428 stmt_vec_info first_stmt_info
5429 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5430 if (!this_load_permuted
5431 /* The load requires permutation when unrolling exposes
5432 a gap either because the group is larger than the SLP
5433 group-size or because there is a gap between the groups. */
5434 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5435 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5436 && DR_GROUP_GAP (first_stmt_info) == 0)))
5438 SLP_TREE_LOAD_PERMUTATION (node).release ();
5439 continue;
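/* Example (illustrative): in a basic block, a node whose scalar stmts load
   a[1] and a[2] out of an interleaving group a[0..3] forms a subchain of
   consecutive group members with gap 1, so its load permutation can be
   dropped.  In a loop, a load permutation that is already the identity is
   only dropped when unrolling exposes no gap, i.e. when the vectorization
   factor is 1 or the node covers the whole group and the group has no gap.  */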
5445 /* Print the partition graph and layout information to the dump file. */
5447 void
5448 vect_optimize_slp_pass::dump ()
5450 dump_printf_loc (MSG_NOTE, vect_location,
5451 "SLP optimize permutations:\n");
5452 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5454 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5455 const char *sep = "";
5456 for (unsigned int idx : m_perms[layout_i])
5458 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5459 sep = ", ";
5461 dump_printf (MSG_NOTE, " }\n");
5463 dump_printf_loc (MSG_NOTE, vect_location,
5464 "SLP optimize partitions:\n");
5465 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5466 ++partition_i)
5468 auto &partition = m_partitions[partition_i];
5469 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5470 dump_printf_loc (MSG_NOTE, vect_location,
5471 " partition %d (layout %d):\n",
5472 partition_i, partition.layout);
5473 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5474 for (unsigned int order_i = partition.node_begin;
5475 order_i < partition.node_end; ++order_i)
5477 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5478 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5479 (void *) vertex.node);
5480 dump_printf_loc (MSG_NOTE, vect_location,
5481 " weight: %f\n",
5482 vertex.weight.to_double ());
5483 if (vertex.out_degree)
5484 dump_printf_loc (MSG_NOTE, vect_location,
5485 " out weight: %f (degree %d)\n",
5486 vertex.out_weight.to_double (),
5487 vertex.out_degree);
5488 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5489 dump_printf_loc (MSG_NOTE, vect_location,
5490 " op: VEC_PERM_EXPR\n");
5491 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5492 dump_printf_loc (MSG_NOTE, vect_location,
5493 " op template: %G", rep->stmt);
5495 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5496 for (unsigned int order_i = partition.node_begin;
5497 order_i < partition.node_end; ++order_i)
5499 unsigned int node_i = m_partitioned_nodes[order_i];
5500 auto &vertex = m_vertices[node_i];
5501 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5503 auto &other_vertex = m_vertices[other_node_i];
5504 if (other_vertex.partition < vertex.partition)
5505 dump_printf_loc (MSG_NOTE, vect_location,
5506 " - %p [%d] --> %p\n",
5507 (void *) other_vertex.node,
5508 other_vertex.partition,
5509 (void *) vertex.node);
5510 else
5511 dump_printf_loc (MSG_NOTE, vect_location,
5512 " - %p --> [%d] %p\n",
5513 (void *) vertex.node,
5514 other_vertex.partition,
5515 (void *) other_vertex.node);
5517 for_each_partition_edge (node_i, print_edge);
5520 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5522 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5523 if (layout_costs.is_possible ())
5525 dump_printf_loc (MSG_NOTE, vect_location,
5526 " layout %d:%s\n", layout_i,
5527 partition.layout == int (layout_i)
5528 ? " (*)" : "");
5529 slpg_layout_cost combined_cost = layout_costs.in_cost;
5530 combined_cost.add_serial_cost (layout_costs.internal_cost);
5531 combined_cost.add_serial_cost (layout_costs.out_cost);
5532 #define TEMPLATE "{depth: %f, total: %f}"
5533 dump_printf_loc (MSG_NOTE, vect_location,
5534 " " TEMPLATE "\n",
5535 layout_costs.in_cost.depth.to_double (),
5536 layout_costs.in_cost.total.to_double ());
5537 dump_printf_loc (MSG_NOTE, vect_location,
5538 " + " TEMPLATE "\n",
5539 layout_costs.internal_cost.depth.to_double (),
5540 layout_costs.internal_cost.total.to_double ());
5541 dump_printf_loc (MSG_NOTE, vect_location,
5542 " + " TEMPLATE "\n",
5543 layout_costs.out_cost.depth.to_double (),
5544 layout_costs.out_cost.total.to_double ());
5545 dump_printf_loc (MSG_NOTE, vect_location,
5546 " = " TEMPLATE "\n",
5547 combined_cost.depth.to_double (),
5548 combined_cost.total.to_double ());
5549 #undef TEMPLATE
5551 else
5552 dump_printf_loc (MSG_NOTE, vect_location,
5553 " layout %d: rejected\n", layout_i);
5558 /* Main entry point for the SLP graph optimization pass. */
5560 void
5561 vect_optimize_slp_pass::run ()
5563 build_graph ();
5564 create_partitions ();
5565 start_choosing_layouts ();
5566 if (m_perms.length () > 1)
5568 forward_pass ();
5569 backward_pass ();
5570 if (dump_enabled_p ())
5571 dump ();
5572 materialize ();
5573 while (!m_perms.is_empty ())
5574 m_perms.pop ().release ();
5576 else
5577 remove_redundant_permutations ();
5578 free_graph (m_slpg);
5581 /* Optimize the SLP graph of VINFO. */
5583 void
5584 vect_optimize_slp (vec_info *vinfo)
5586 if (vinfo->slp_instances.is_empty ())
5587 return;
5588 vect_optimize_slp_pass (vinfo).run ();
5591 /* Gather loads reachable from the individual SLP graph entries. */
5593 void
5594 vect_gather_slp_loads (vec_info *vinfo)
5596 unsigned i;
5597 slp_instance instance;
5598 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5600 hash_set<slp_tree> visited;
5601 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5602 SLP_INSTANCE_TREE (instance), visited);
5607 /* For each possible SLP instance decide whether to SLP it and calculate the
5608 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
5609 to SLP at least one instance. */
5611 bool
5612 vect_make_slp_decision (loop_vec_info loop_vinfo)
5614 unsigned int i;
5615 poly_uint64 unrolling_factor = 1;
5616 const vec<slp_instance> &slp_instances
5617 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5618 slp_instance instance;
5619 int decided_to_slp = 0;
5621 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5623 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5625 /* FORNOW: SLP if you can. */
5626 /* All unroll factors have the form:
5628 GET_MODE_SIZE (vinfo->vector_mode) * X
5630 for some rational X, so they must have a common multiple. */
5631 unrolling_factor
5632 = force_common_multiple (unrolling_factor,
5633 SLP_INSTANCE_UNROLLING_FACTOR (instance));
5635 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5636 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5637 loop-based vectorization. Such stmts will be marked as HYBRID. */
5638 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5639 decided_to_slp++;
5642 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5644 if (decided_to_slp && dump_enabled_p ())
5646 dump_printf_loc (MSG_NOTE, vect_location,
5647 "Decided to SLP %d instances. Unrolling factor ",
5648 decided_to_slp);
5649 dump_dec (MSG_NOTE, unrolling_factor);
5650 dump_printf (MSG_NOTE, "\n");
5653 return (decided_to_slp > 0);
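/* Example (illustrative, hypothetical numbers): if one SLP instance needs an
   unrolling factor of 2 and another needs 8, force_common_multiple yields 8
   and LOOP_VINFO_SLP_UNROLLING_FACTOR becomes 8.  Because every factor has
   the form GET_MODE_SIZE (vector_mode) * X for some rational X, such a
   common multiple exists even for variable-length vector modes.  */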
5656 /* Private data for vect_detect_hybrid_slp. */
5657 struct vdhs_data
5659 loop_vec_info loop_vinfo;
5660 vec<stmt_vec_info> *worklist;
5663 /* Walker for walk_gimple_op. */
5665 static tree
5666 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5668 walk_stmt_info *wi = (walk_stmt_info *)data;
5669 vdhs_data *dat = (vdhs_data *)wi->info;
5671 if (wi->is_lhs)
5672 return NULL_TREE;
5674 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5675 if (!def_stmt_info)
5676 return NULL_TREE;
5677 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5678 if (PURE_SLP_STMT (def_stmt_info))
5680 if (dump_enabled_p ())
5681 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5682 def_stmt_info->stmt);
5683 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5684 dat->worklist->safe_push (def_stmt_info);
5687 return NULL_TREE;
5690 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
5691 if so, otherwise push it to WORKLIST. */
5693 static void
5694 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5695 vec<stmt_vec_info> &worklist,
5696 stmt_vec_info stmt_info)
5698 if (dump_enabled_p ())
5699 dump_printf_loc (MSG_NOTE, vect_location,
5700 "Processing hybrid candidate : %G", stmt_info->stmt);
5701 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5702 imm_use_iterator iter2;
5703 ssa_op_iter iter1;
5704 use_operand_p use_p;
5705 def_operand_p def_p;
5706 bool any_def = false;
5707 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5709 any_def = true;
5710 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5712 if (is_gimple_debug (USE_STMT (use_p)))
5713 continue;
5714 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5715 /* An out-of-loop use means this is a loop_vect sink. */
5716 if (!use_info)
5718 if (dump_enabled_p ())
5719 dump_printf_loc (MSG_NOTE, vect_location,
5720 "Found loop_vect sink: %G", stmt_info->stmt);
5721 worklist.safe_push (stmt_info);
5722 return;
5724 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5726 if (dump_enabled_p ())
5727 dump_printf_loc (MSG_NOTE, vect_location,
5728 "Found loop_vect use: %G", use_info->stmt);
5729 worklist.safe_push (stmt_info);
5730 return;
5734 /* No def means this is a loop_vect sink. */
5735 if (!any_def)
5737 if (dump_enabled_p ())
5738 dump_printf_loc (MSG_NOTE, vect_location,
5739 "Found loop_vect sink: %G", stmt_info->stmt);
5740 worklist.safe_push (stmt_info);
5741 return;
5743 if (dump_enabled_p ())
5744 dump_printf_loc (MSG_NOTE, vect_location,
5745 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5746 STMT_SLP_TYPE (stmt_info) = pure_slp;
5749 /* Find stmts that must be both vectorized and SLPed. */
5751 void
5752 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5754 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5756 /* All stmts participating in SLP are marked pure_slp, all other
5757 stmts are loop_vect.
5758 First collect all loop_vect stmts into a worklist.
5759 SLP patterns cause not all original scalar stmts to appear in
5760 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5761 Rectify this here and do a backward walk over the IL, considering
5762 stmts as loop_vect only when they are used by a loop_vect stmt, and
5763 otherwise marking them as pure_slp. */
5764 auto_vec<stmt_vec_info> worklist;
5765 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5767 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5768 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5769 gsi_next (&gsi))
5771 gphi *phi = gsi.phi ();
5772 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5773 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5774 maybe_push_to_hybrid_worklist (loop_vinfo,
5775 worklist, stmt_info);
5777 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5778 gsi_prev (&gsi))
5780 gimple *stmt = gsi_stmt (gsi);
5781 if (is_gimple_debug (stmt))
5782 continue;
5783 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
5784 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5786 for (gimple_stmt_iterator gsi2
5787 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5788 !gsi_end_p (gsi2); gsi_next (&gsi2))
5790 stmt_vec_info patt_info
5791 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
5792 if (!STMT_SLP_TYPE (patt_info)
5793 && STMT_VINFO_RELEVANT (patt_info))
5794 maybe_push_to_hybrid_worklist (loop_vinfo,
5795 worklist, patt_info);
5797 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5799 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5800 maybe_push_to_hybrid_worklist (loop_vinfo,
5801 worklist, stmt_info);
5805 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
5806 mark any SLP vectorized stmt as hybrid.
5807 ??? We're visiting def stmts N times (once for each non-SLP and
5808 once for each hybrid-SLP use). */
5809 walk_stmt_info wi;
5810 vdhs_data dat;
5811 dat.worklist = &worklist;
5812 dat.loop_vinfo = loop_vinfo;
5813 memset (&wi, 0, sizeof (wi));
5814 wi.info = (void *)&dat;
5815 while (!worklist.is_empty ())
5817 stmt_vec_info stmt_info = worklist.pop ();
5818 /* Since SSA operands are not set up for pattern stmts we need
5819 to use walk_gimple_op. */
5820 wi.is_lhs = 0;
5821 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
5822 /* For gather/scatter make sure to walk the offset operand, that
5823 can be a scaling and conversion away. */
5824 gather_scatter_info gs_info;
5825 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5826 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
5828 int dummy;
5829 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
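/* Example (illustrative GIMPLE): assume _1 = a[i_2] * 3 is part of an SLP
   instance (pure_slp) while the reduction sum_5 = _1 + sum_4 is only handled
   by loop-based vectorization.  The reduction stmt is collected as loop_vect
   by the walk above, and following its operands reaches the pure_slp
   definition of _1, which is marked hybrid and pushed to the worklist:
   it must be vectorized both as part of the SLP graph and by the loop
   vectorizer so that the non-SLP use has a vector definition.  */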
5835 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
5837 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
5838 : vec_info (vec_info::bb, shared),
5839 bbs (_bbs),
5840 roots (vNULL)
5842 for (unsigned i = 0; i < bbs.length (); ++i)
5844 if (i != 0)
5845 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5846 gsi_next (&si))
5848 gphi *phi = si.phi ();
5849 gimple_set_uid (phi, 0);
5850 add_stmt (phi);
5852 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5853 !gsi_end_p (gsi); gsi_next (&gsi))
5855 gimple *stmt = gsi_stmt (gsi);
5856 gimple_set_uid (stmt, 0);
5857 if (is_gimple_debug (stmt))
5858 continue;
5859 add_stmt (stmt);
5865 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
5866 stmts in the basic block. */
5868 _bb_vec_info::~_bb_vec_info ()
5870 /* Reset region marker. */
5871 for (unsigned i = 0; i < bbs.length (); ++i)
5873 if (i != 0)
5874 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5875 gsi_next (&si))
5877 gphi *phi = si.phi ();
5878 gimple_set_uid (phi, -1);
5880 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5881 !gsi_end_p (gsi); gsi_next (&gsi))
5883 gimple *stmt = gsi_stmt (gsi);
5884 gimple_set_uid (stmt, -1);
5888 for (unsigned i = 0; i < roots.length (); ++i)
5890 roots[i].stmts.release ();
5891 roots[i].roots.release ();
5893 roots.release ();
5896 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
5897 given that child nodes have already been processed, and that
5898 their def types currently match their SLP node's def type. */
5900 static bool
5901 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
5902 slp_instance node_instance,
5903 stmt_vector_for_cost *cost_vec)
5905 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5907 /* Calculate the number of vector statements to be created for the
5908 scalar stmts in this node. For SLP reductions it is equal to the
5909 number of vector statements in the children (which has already been
5910 calculated by the recursive call). Otherwise it is the number of
5911 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
5912 VF divided by the number of elements in a vector. */
5913 if (!STMT_VINFO_DATA_REF (stmt_info)
5914 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5916 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
5917 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
5919 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
5920 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
5921 break;
5924 else
5926 poly_uint64 vf;
5927 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
5928 vf = loop_vinfo->vectorization_factor;
5929 else
5930 vf = 1;
5931 unsigned int group_size = SLP_TREE_LANES (node);
5932 tree vectype = SLP_TREE_VECTYPE (node);
5933 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
5934 = vect_get_num_vectors (vf * group_size, vectype);
5937 /* Handle purely internal nodes. */
5938 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5940 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
5941 return false;
5943 stmt_vec_info slp_stmt_info;
5944 unsigned int i;
5945 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
5947 if (STMT_VINFO_LIVE_P (slp_stmt_info)
5948 && !vectorizable_live_operation (vinfo,
5949 slp_stmt_info, NULL, node,
5950 node_instance, i,
5951 false, cost_vec))
5952 return false;
5954 return true;
5957 bool dummy;
5958 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
5959 node, node_instance, cost_vec);
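/* Worked example for the non-reduction path above (hypothetical numbers):
   with a loop vectorization factor of 4, an SLP node of 2 lanes and a V4SI
   vector type, vect_get_num_vectors (4 * 2, V4SI) gives 8 / 4 == 2, so two
   vector statements are created for the node per vectorized loop iteration.  */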
5962 /* Try to build NODE from scalars, returning true on success.
5963 NODE_INSTANCE is the SLP instance that contains NODE. */
5965 static bool
5966 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
5967 slp_instance node_instance)
5969 stmt_vec_info stmt_info;
5970 unsigned int i;
5972 if (!is_a <bb_vec_info> (vinfo)
5973 || node == SLP_INSTANCE_TREE (node_instance)
5974 || !SLP_TREE_SCALAR_STMTS (node).exists ()
5975 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
5976 /* Force the mask use to be built from scalars instead. */
5977 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
5978 return false;
5980 if (dump_enabled_p ())
5981 dump_printf_loc (MSG_NOTE, vect_location,
5982 "Building vector operands of %p from scalars instead\n",
5983 (void *) node);
5985 /* Don't remove and free the child nodes here, since they could be
5986 referenced by other structures. The analysis and scheduling phases
5987 (need to) ignore child nodes of anything that isn't vect_internal_def. */
5988 unsigned int group_size = SLP_TREE_LANES (node);
5989 SLP_TREE_DEF_TYPE (node) = vect_external_def;
5990 /* Invariants get their vector type from the uses. */
5991 SLP_TREE_VECTYPE (node) = NULL_TREE;
5992 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
5993 SLP_TREE_LOAD_PERMUTATION (node).release ();
5994 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5996 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
5997 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
5999 return true;
6002 /* Return true if all elements of the slice are the same. */
6003 bool
6004 vect_scalar_ops_slice::all_same_p () const
6006 for (unsigned int i = 1; i < length; ++i)
6007 if (!operand_equal_p (op (0), op (i)))
6008 return false;
6009 return true;
6012 hashval_t
6013 vect_scalar_ops_slice_hash::hash (const value_type &s)
6015 hashval_t hash = 0;
6016 for (unsigned i = 0; i < s.length; ++i)
6017 hash = iterative_hash_expr (s.op (i), hash);
6018 return hash;
6021 bool
6022 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6023 const compare_type &s2)
6025 if (s1.length != s2.length)
6026 return false;
6027 for (unsigned i = 0; i < s1.length; ++i)
6028 if (!operand_equal_p (s1.op (i), s2.op (i)))
6029 return false;
6030 return true;
6033 /* Compute the prologue cost for invariant or constant operands represented
6034 by NODE. */
6036 static void
6037 vect_prologue_cost_for_slp (slp_tree node,
6038 stmt_vector_for_cost *cost_vec)
6040 /* There's a special case of an existing vector, that costs nothing. */
6041 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6042 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6043 return;
6044 /* Without looking at the actual initializer a vector of
6045 constants can be implemented as a load from the constant pool.
6046 When all elements are the same we can use a splat. */
6047 tree vectype = SLP_TREE_VECTYPE (node);
6048 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6049 unsigned HOST_WIDE_INT const_nunits;
6050 unsigned nelt_limit;
6051 auto ops = &SLP_TREE_SCALAR_OPS (node);
6052 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6053 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6054 && ! multiple_p (const_nunits, group_size))
6056 nelt_limit = const_nunits;
6057 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6058 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6059 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6060 starts.quick_push (i * const_nunits);
6062 else
6064 /* If either the vector has variable length or the vectors
6065 are composed of repeated whole groups we only need to
6066 cost construction once. All vectors will be the same. */
6067 nelt_limit = group_size;
6068 starts.quick_push (0);
6070 /* ??? We're just tracking whether vectors in a single node are the same.
6071 Ideally we'd do something more global. */
6072 bool passed = false;
6073 for (unsigned int start : starts)
6075 vect_cost_for_stmt kind;
6076 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6077 kind = vector_load;
6078 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6079 kind = scalar_to_vec;
6080 else
6081 kind = vec_construct;
6082 /* The target cost hook has no idea which part of the SLP node
6083 we are costing so avoid passing it down more than once. Pass
6084 it to the first vec_construct or scalar_to_vec part since for those
6085 the x86 backend tries to account for GPR to XMM register moves. */
6086 record_stmt_cost (cost_vec, 1, kind,
6087 (kind != vector_load && !passed) ? node : nullptr,
6088 vectype, 0, vect_prologue);
6089 if (kind != vector_load)
6090 passed = true;
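/* Example (illustrative): with group_size == 2, scalar ops {x, y} and a V4SI
   vector type, nunits (4) is a multiple of the group size, so every vector
   built is {x, y, x, y} and a single construction is costed.  For an
   external node with x == y the slice is all_same_p () and the cheaper
   scalar_to_vec (splat) is used; constant nodes are costed as a vector_load
   from the constant pool instead.  When nunits is not a multiple of the
   group size, each distinct slice of nunits elements is hashed so identical
   vectors are still only costed once.  */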
6094 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6095 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
6097 Return true if the operations are supported. */
6099 static bool
6100 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6101 slp_instance node_instance,
6102 hash_set<slp_tree> &visited_set,
6103 vec<slp_tree> &visited_vec,
6104 stmt_vector_for_cost *cost_vec)
6106 int i, j;
6107 slp_tree child;
6109 /* Assume we can code-generate all invariants. */
6110 if (!node
6111 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6112 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6113 return true;
6115 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6117 if (dump_enabled_p ())
6118 dump_printf_loc (MSG_NOTE, vect_location,
6119 "Failed cyclic SLP reference in %p\n", (void *) node);
6120 return false;
6122 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6124 /* If we already analyzed the exact same set of scalar stmts we're done.
6125 We share the generated vector stmts for those. */
6126 if (visited_set.add (node))
6127 return true;
6128 visited_vec.safe_push (node);
6130 bool res = true;
6131 unsigned visited_rec_start = visited_vec.length ();
6132 unsigned cost_vec_rec_start = cost_vec->length ();
6133 bool seen_non_constant_child = false;
6134 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6136 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6137 visited_set, visited_vec,
6138 cost_vec);
6139 if (!res)
6140 break;
6141 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6142 seen_non_constant_child = true;
6144 /* We're having difficulties scheduling nodes with just constant
6145 operands and no scalar stmts since we then cannot compute a stmt
6146 insertion place. */
6147 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6149 if (dump_enabled_p ())
6150 dump_printf_loc (MSG_NOTE, vect_location,
6151 "Cannot vectorize all-constant op node %p\n",
6152 (void *) node);
6153 res = false;
6156 if (res)
6157 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6158 cost_vec);
6159 /* If analysis failed we have to pop all recursive visited nodes
6160 plus ourselves. */
6161 if (!res)
6163 while (visited_vec.length () >= visited_rec_start)
6164 visited_set.remove (visited_vec.pop ());
6165 cost_vec->truncate (cost_vec_rec_start);
6168 /* When the node can be vectorized, cost invariant nodes it references.
6169 This is not done in DFS order to allow the referring node's
6170 vectorizable_* calls to nail down the invariant nodes' vector type
6171 and possibly unshare it if it needs a different vector type than
6172 other referrers. */
6173 if (res)
6174 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6175 if (child
6176 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6177 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6178 /* Perform usual caching, note code-generation still
6179 code-gens these nodes multiple times but we expect
6180 to CSE them later. */
6181 && !visited_set.add (child))
6183 visited_vec.safe_push (child);
6184 /* ??? After auditing more code paths make a "default"
6185 and push the vector type from NODE to all children
6186 if it is not already set. */
6187 /* Compute the number of vectors to be generated. */
6188 tree vector_type = SLP_TREE_VECTYPE (child);
6189 if (!vector_type)
6191 /* For shifts with a scalar argument we don't need
6192 to cost or code-generate anything.
6193 ??? Represent this more explicitly. */
6194 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6195 == shift_vec_info_type)
6196 && j == 1);
6197 continue;
6199 unsigned group_size = SLP_TREE_LANES (child);
6200 poly_uint64 vf = 1;
6201 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6202 vf = loop_vinfo->vectorization_factor;
6203 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6204 = vect_get_num_vectors (vf * group_size, vector_type);
6205 /* And cost them. */
6206 vect_prologue_cost_for_slp (child, cost_vec);
6209 /* If this node or any of its children can't be vectorized, try pruning
6210 the tree here rather than felling the whole thing. */
6211 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6213 /* We'll need to revisit this for invariant costing and number
6214 of vectorized stmt setting. */
6215 res = true;
6218 return res;
6221 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6222 region and that can be vectorized using vectorizable_live_operation
6223 with STMT_VINFO_LIVE_P. Not handled live operations will cause the
6224 scalar code computing it to be retained. */
6226 static void
6227 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6228 slp_instance instance,
6229 stmt_vector_for_cost *cost_vec,
6230 hash_set<stmt_vec_info> &svisited,
6231 hash_set<slp_tree> &visited)
6233 if (visited.add (node))
6234 return;
6236 unsigned i;
6237 stmt_vec_info stmt_info;
6238 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6239 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6241 if (svisited.contains (stmt_info))
6242 continue;
6243 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6244 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6245 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6246 /* Only the pattern root stmt computes the original scalar value. */
6247 continue;
6248 bool mark_visited = true;
6249 gimple *orig_stmt = orig_stmt_info->stmt;
6250 ssa_op_iter op_iter;
6251 def_operand_p def_p;
6252 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6254 imm_use_iterator use_iter;
6255 gimple *use_stmt;
6256 stmt_vec_info use_stmt_info;
6257 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6258 if (!is_gimple_debug (use_stmt))
6260 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6261 if (!use_stmt_info
6262 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6264 STMT_VINFO_LIVE_P (stmt_info) = true;
6265 if (vectorizable_live_operation (bb_vinfo, stmt_info,
6266 NULL, node, instance, i,
6267 false, cost_vec))
6268 /* ??? So we know we can vectorize the live stmt
6269 from one SLP node. If we cannot do so from all
6270 or none consistently we'd have to record which
6271 SLP node (and lane) we want to use for the live
6272 operation. So make sure we can code-generate
6273 from all nodes. */
6274 mark_visited = false;
6275 else
6276 STMT_VINFO_LIVE_P (stmt_info) = false;
6277 break;
6280 /* We have to verify whether we can insert the lane extract
6281 before all uses. The following is a conservative approximation.
6282 We cannot put this into vectorizable_live_operation because
6283 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6284 doesn't work.
6285 Note that while the fact that we emit code for loads at the
6286 first load should make this a non-problem, leafs we construct
6287 from scalars are vectorized after the last scalar def.
6288 ??? If we'd actually compute the insert location during
6289 analysis we could use something less conservative than the last
6290 scalar stmt in the node for the dominance check. */
6291 /* ??? What remains is "live" uses in vector CTORs in the same
6292 SLP graph which is where those uses can end up code-generated
6293 right after their definition instead of close to their original
6294 use. But that would restrict us to code-generate lane-extracts
6295 from the latest stmt in a node. So we compensate for this
6296 during code-generation, simply not replacing uses for those
6297 hopefully rare cases. */
6298 if (STMT_VINFO_LIVE_P (stmt_info))
6299 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6300 if (!is_gimple_debug (use_stmt)
6301 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6302 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6303 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6305 if (dump_enabled_p ())
6306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6307 "Cannot determine insertion place for "
6308 "lane extract\n");
6309 STMT_VINFO_LIVE_P (stmt_info) = false;
6310 mark_visited = true;
6313 if (mark_visited)
6314 svisited.add (stmt_info);
6317 slp_tree child;
6318 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6319 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6320 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
6321 cost_vec, svisited, visited);
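/* Example (illustrative): if lane 1 of an SLP node computes _7 and _7 is
   also used by a scalar stmt that is not part of the vectorized region, the
   lane is marked STMT_VINFO_LIVE_P and vectorizable_live_operation costs a
   BIT_FIELD_REF-style lane extract.  If no insertion place dominating all
   such uses can be found, the flag is cleared again and the scalar
   computation of _7 is simply retained.  */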
6324 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6326 static bool
6327 vectorizable_bb_reduc_epilogue (slp_instance instance,
6328 stmt_vector_for_cost *cost_vec)
6330 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6331 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6332 if (reduc_code == MINUS_EXPR)
6333 reduc_code = PLUS_EXPR;
6334 internal_fn reduc_fn;
6335 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6336 if (!vectype
6337 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6338 || reduc_fn == IFN_LAST
6339 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6340 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6341 TREE_TYPE (vectype)))
6342 return false;
6344 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6345 cost log2 vector operations plus shuffles and one extraction. */
6346 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6347 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6348 vectype, 0, vect_body);
6349 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6350 vectype, 0, vect_body);
6351 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6352 vectype, 0, vect_body);
6353 return true;
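/* Worked example for the costing above (hypothetical vector type): for a
   V8SI reduction, vect_nunits_for_cost gives 8 and floor_log2 (8) == 3, so
   the epilogue is costed as 3 vector_stmt plus 3 vec_perm operations plus
   one vec_to_scalar extraction of the final result.  */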
6356 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6357 and recurse to children. */
6359 static void
6360 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6361 hash_set<slp_tree> &visited)
6363 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6364 || visited.add (node))
6365 return;
6367 stmt_vec_info stmt;
6368 unsigned i;
6369 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6370 roots.remove (vect_orig_stmt (stmt));
6372 slp_tree child;
6373 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6374 if (child)
6375 vect_slp_prune_covered_roots (child, roots, visited);
6378 /* Analyze statements in SLP instances of VINFO. Return true if the
6379 operations are supported. */
6381 bool
6382 vect_slp_analyze_operations (vec_info *vinfo)
6384 slp_instance instance;
6385 int i;
6387 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6389 hash_set<slp_tree> visited;
6390 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6392 auto_vec<slp_tree> visited_vec;
6393 stmt_vector_for_cost cost_vec;
6394 cost_vec.create (2);
6395 if (is_a <bb_vec_info> (vinfo))
6396 vect_location = instance->location ();
6397 if (!vect_slp_analyze_node_operations (vinfo,
6398 SLP_INSTANCE_TREE (instance),
6399 instance, visited, visited_vec,
6400 &cost_vec)
6401 /* CTOR instances require vectorized defs for the SLP tree root. */
6402 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6403 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6404 != vect_internal_def
6405 /* Make sure we vectorized with the expected type. */
6406 || !useless_type_conversion_p
6407 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6408 (instance->root_stmts[0]->stmt))),
6409 TREE_TYPE (SLP_TREE_VECTYPE
6410 (SLP_INSTANCE_TREE (instance))))))
6411 /* Check we can vectorize the reduction. */
6412 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6413 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6415 slp_tree node = SLP_INSTANCE_TREE (instance);
6416 stmt_vec_info stmt_info;
6417 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6418 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6419 else
6420 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6421 if (dump_enabled_p ())
6422 dump_printf_loc (MSG_NOTE, vect_location,
6423 "removing SLP instance operations starting from: %G",
6424 stmt_info->stmt);
6425 vect_free_slp_instance (instance);
6426 vinfo->slp_instances.ordered_remove (i);
6427 cost_vec.release ();
6428 while (!visited_vec.is_empty ())
6429 visited.remove (visited_vec.pop ());
6431 else
6433 i++;
6434 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6436 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6437 cost_vec.release ();
6439 else
6440 /* For BB vectorization remember the SLP graph entry
6441 cost for later. */
6442 instance->cost_vec = cost_vec;
6446 /* Now look for SLP instances with a root that are covered by other
6447 instances and remove them. */
6448 hash_set<stmt_vec_info> roots;
6449 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6450 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6451 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6452 if (!roots.is_empty ())
6454 visited.empty ();
6455 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6456 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6457 visited);
6458 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6459 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6460 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6462 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6463 if (dump_enabled_p ())
6464 dump_printf_loc (MSG_NOTE, vect_location,
6465 "removing SLP instance operations starting "
6466 "from: %G", root->stmt);
6467 vect_free_slp_instance (instance);
6468 vinfo->slp_instances.ordered_remove (i);
6470 else
6471 ++i;
6474 /* Compute vectorizable live stmts. */
6475 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6477 hash_set<stmt_vec_info> svisited;
6478 hash_set<slp_tree> visited;
6479 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6481 vect_location = instance->location ();
6482 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6483 instance, &instance->cost_vec, svisited,
6484 visited);
6488 return !vinfo->slp_instances.is_empty ();
6491 /* Get the ultimate SLP instance leader from INSTANCE_LEADER, transitively
6492 compressing any leader chain encountered along the way. */
6494 static slp_instance
6495 get_ultimate_leader (slp_instance instance,
6496 hash_map<slp_instance, slp_instance> &instance_leader)
6498 auto_vec<slp_instance *, 8> chain;
6499 slp_instance *tem;
6500 while (*(tem = instance_leader.get (instance)) != instance)
6502 chain.safe_push (tem);
6503 instance = *tem;
6505 while (!chain.is_empty ())
6506 *chain.pop () = instance;
6507 return instance;
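/* Illustrative sketch, not part of the pass: the function above is the
   "find" half of a union-find structure with path compression, keyed by
   slp_instance.  The same idea over plain integers (hypothetical helper,
   assumes every element was first mapped to itself, needs <unordered_map>
   and <vector>):

     static int
     find_leader (int x, std::unordered_map<int, int> &leader)
     {
       std::vector<int> chain;
       while (leader[x] != x)
         {
           chain.push_back (x);
           x = leader[x];
         }
       for (int y : chain)
         leader[y] = x;   // Path compression.
       return x;
     }

   vect_map_to_instance below provides the corresponding "union" step by
   redirecting an old ultimate leader to the current instance.  */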
6510 namespace {
6511 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6512 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6513 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6515 INSTANCE_LEADER is as for get_ultimate_leader. */
6517 template<typename T>
6518 bool
6519 vect_map_to_instance (slp_instance instance, T key,
6520 hash_map<T, slp_instance> &key_to_instance,
6521 hash_map<slp_instance, slp_instance> &instance_leader)
6523 bool existed_p;
6524 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6525 if (!existed_p)
6527 else if (key_instance != instance)
6529 /* If we're running into a previously marked key make us the
6530 leader of the current ultimate leader. This keeps the
6531 leader chain acyclic and works even when the current instance
6532 connects two previously independent graph parts. */
6533 slp_instance key_leader
6534 = get_ultimate_leader (key_instance, instance_leader);
6535 if (key_leader != instance)
6536 instance_leader.put (key_leader, instance);
6538 key_instance = instance;
6539 return existed_p;
6543 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6545 static void
6546 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6547 slp_instance instance, slp_tree node,
6548 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6549 hash_map<slp_tree, slp_instance> &node_to_instance,
6550 hash_map<slp_instance, slp_instance> &instance_leader)
6552 stmt_vec_info stmt_info;
6553 unsigned i;
6555 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6556 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6557 instance_leader);
6559 if (vect_map_to_instance (instance, node, node_to_instance,
6560 instance_leader))
6561 return;
6563 slp_tree child;
6564 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6565 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6566 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6567 node_to_instance, instance_leader);
6570 /* Partition the SLP graph into pieces that can be costed independently. */
6572 static void
6573 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6575 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6577 /* First walk the SLP graph assigning each involved scalar stmt a
6578 corresponding SLP graph entry and upon visiting a previously
6579 marked stmt, make the stmt's leader the current SLP graph entry. */
6580 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6581 hash_map<slp_tree, slp_instance> node_to_instance;
6582 hash_map<slp_instance, slp_instance> instance_leader;
6583 slp_instance instance;
6584 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6586 instance_leader.put (instance, instance);
6587 vect_bb_partition_graph_r (bb_vinfo,
6588 instance, SLP_INSTANCE_TREE (instance),
6589 stmt_to_instance, node_to_instance,
6590 instance_leader);
6593 /* Then collect entries to each independent subgraph. */
6594 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6596 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6597 leader->subgraph_entries.safe_push (instance);
6598 if (dump_enabled_p ()
6599 && leader != instance)
6600 dump_printf_loc (MSG_NOTE, vect_location,
6601 "instance %p is leader of %p\n",
6602 (void *) leader, (void *) instance);
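/* Example (illustrative): if instances I1 and I2 both cover scalar stmt S
   and I1 is walked first, S is mapped to I1.  When I2 reaches S, I1's
   ultimate leader is redirected to I2, so the second loop pushes both I1 and
   I2 into I2->subgraph_entries and the whole subgraph is costed as one unit
   by vect_bb_vectorization_profitable_p.  */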
6606 /* Compute the set of scalar stmts participating in internal and external
6607 nodes. */
6609 static void
6610 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6611 hash_set<slp_tree> &visited,
6612 hash_set<stmt_vec_info> &vstmts,
6613 hash_set<stmt_vec_info> &estmts)
6615 int i;
6616 stmt_vec_info stmt_info;
6617 slp_tree child;
6619 if (visited.add (node))
6620 return;
6622 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6624 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6625 vstmts.add (stmt_info);
6627 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6628 if (child)
6629 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6630 vstmts, estmts);
6632 else
6633 for (tree def : SLP_TREE_SCALAR_OPS (node))
6635 stmt_vec_info def_stmt = vinfo->lookup_def (def);
6636 if (def_stmt)
6637 estmts.add (def_stmt);
6642 /* Compute the scalar cost of the SLP node NODE and its children
6643 and record it in COST_VEC. Do not account defs that are marked in LIFE and
6644 update LIFE according to uses of NODE. */
6646 static void
6647 vect_bb_slp_scalar_cost (vec_info *vinfo,
6648 slp_tree node, vec<bool, va_heap> *life,
6649 stmt_vector_for_cost *cost_vec,
6650 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6651 hash_set<slp_tree> &visited)
6653 unsigned i;
6654 stmt_vec_info stmt_info;
6655 slp_tree child;
6657 if (visited.add (node))
6658 return;
6660 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6662 ssa_op_iter op_iter;
6663 def_operand_p def_p;
6665 if ((*life)[i])
6666 continue;
6668 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6669 gimple *orig_stmt = orig_stmt_info->stmt;
6671 /* If there is a non-vectorized use of the defs then the scalar
6672 stmt is kept live in which case we do not account it or any
6673 required defs in the SLP children in the scalar cost. This
6674 way we make the vectorization more costly when compared to
6675 the scalar cost. */
6676 if (!STMT_VINFO_LIVE_P (stmt_info))
6678 auto_vec<gimple *, 8> worklist;
6679 hash_set<gimple *> *worklist_visited = NULL;
6680 worklist.quick_push (orig_stmt);
6683 gimple *work_stmt = worklist.pop ();
6684 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
6686 imm_use_iterator use_iter;
6687 gimple *use_stmt;
6688 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
6689 DEF_FROM_PTR (def_p))
6690 if (!is_gimple_debug (use_stmt))
6692 stmt_vec_info use_stmt_info
6693 = vinfo->lookup_stmt (use_stmt);
6694 if (!use_stmt_info
6695 || !vectorized_scalar_stmts.contains (use_stmt_info))
6697 if (use_stmt_info
6698 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
6700 /* For stmts participating in patterns we have
6701 to check its uses recursively. */
6702 if (!worklist_visited)
6703 worklist_visited = new hash_set<gimple *> ();
6704 if (!worklist_visited->add (use_stmt))
6705 worklist.safe_push (use_stmt);
6706 continue;
6708 (*life)[i] = true;
6709 goto next_lane;
6714 while (!worklist.is_empty ());
6715 next_lane:
6716 if (worklist_visited)
6717 delete worklist_visited;
6718 if ((*life)[i])
6719 continue;
6722 /* Count scalar stmts only once. */
6723 if (gimple_visited_p (orig_stmt))
6724 continue;
6725 gimple_set_visited (orig_stmt, true);
6727 vect_cost_for_stmt kind;
6728 if (STMT_VINFO_DATA_REF (orig_stmt_info))
6730 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
6731 kind = scalar_load;
6732 else
6733 kind = scalar_store;
6735 else if (vect_nop_conversion_p (orig_stmt_info))
6736 continue;
6737 /* For single-argument PHIs assume coalescing which means zero cost
6738 for the scalar and the vector PHIs. This avoids artificially
6739 favoring the vector path (but may pessimize it in some cases). */
6740 else if (is_a <gphi *> (orig_stmt_info->stmt)
6741 && gimple_phi_num_args
6742 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
6743 continue;
6744 else
6745 kind = scalar_stmt;
6746 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
6747 SLP_TREE_VECTYPE (node), 0, vect_body);
6750 auto_vec<bool, 20> subtree_life;
6751 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6753 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6755 /* Do not directly pass LIFE to the recursive call, copy it to
6756 confine changes in the callee to the current child/subtree. */
6757 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6759 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
6760 for (unsigned j = 0;
6761 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
6763 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
6764 if (perm.first == i)
6765 subtree_life[perm.second] = (*life)[j];
6768 else
6770 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
6771 subtree_life.safe_splice (*life);
6773 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
6774 vectorized_scalar_stmts, visited);
6775 subtree_life.truncate (0);
6780 /* Comparator for the loop-index sorted cost vectors. */
6782 static int
6783 li_cost_vec_cmp (const void *a_, const void *b_)
6785 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
6786 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
6787 if (a->first < b->first)
6788 return -1;
6789 else if (a->first == b->first)
6790 return 0;
6791 return 1;
6794 /* Check if vectorization of the basic block is profitable for the
6795 subgraph denoted by SLP_INSTANCES. */
6797 static bool
6798 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
6799 vec<slp_instance> slp_instances,
6800 loop_p orig_loop)
6802 slp_instance instance;
6803 int i;
6804 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
6805 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
6807 if (dump_enabled_p ())
6809 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
6810 hash_set<slp_tree> visited;
6811 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6812 vect_print_slp_graph (MSG_NOTE, vect_location,
6813 SLP_INSTANCE_TREE (instance), visited);
6816 /* Compute the set of scalar stmts we know will go away 'locally' when
6817 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
6818 not accurate for nodes promoted extern late or for scalar stmts that
6819 are used both in extern defs and in vectorized defs. */
6820 hash_set<stmt_vec_info> vectorized_scalar_stmts;
6821 hash_set<stmt_vec_info> scalar_stmts_in_externs;
6822 hash_set<slp_tree> visited;
6823 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6825 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
6826 SLP_INSTANCE_TREE (instance),
6827 visited,
6828 vectorized_scalar_stmts,
6829 scalar_stmts_in_externs);
6830 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
6831 vectorized_scalar_stmts.add (rstmt);
6833 /* Scalar stmts used as defs in external nodes need to be preserved, so
6834 remove them from vectorized_scalar_stmts. */
6835 for (stmt_vec_info stmt : scalar_stmts_in_externs)
6836 vectorized_scalar_stmts.remove (stmt);
6838 /* Calculate scalar cost and sum the cost for the vector stmts
6839 previously collected. */
6840 stmt_vector_for_cost scalar_costs = vNULL;
6841 stmt_vector_for_cost vector_costs = vNULL;
6842 visited.empty ();
6843 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6845 auto_vec<bool, 20> life;
6846 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
6847 true);
6848 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6849 record_stmt_cost (&scalar_costs,
6850 SLP_INSTANCE_ROOT_STMTS (instance).length (),
6851 scalar_stmt,
6852 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
6853 vect_bb_slp_scalar_cost (bb_vinfo,
6854 SLP_INSTANCE_TREE (instance),
6855 &life, &scalar_costs, vectorized_scalar_stmts,
6856 visited);
6857 vector_costs.safe_splice (instance->cost_vec);
6858 instance->cost_vec.release ();
6861 if (dump_enabled_p ())
6862 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
6864 /* When costing non-loop vectorization we need to consider each covered
6865 loop independently and make sure vectorization is profitable. For
6866 now we assume a loop may be not entered or executed an arbitrary
6867 number of iterations (??? static information can provide more
6868 precise info here) which means we can simply cost each containing
6869 loop's stmts separately. */
6871 /* First produce cost vectors sorted by loop index. */
6872 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6873 li_scalar_costs (scalar_costs.length ());
6874 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6875 li_vector_costs (vector_costs.length ());
6876 stmt_info_for_cost *cost;
6877 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
6879 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6880 li_scalar_costs.quick_push (std::make_pair (l, cost));
6882 /* Use an arbitrary used loop as fallback in case the first vector_costs
6883 entry does not have a stmt_info associated with it. */
6884 unsigned l = li_scalar_costs[0].first;
6885 FOR_EACH_VEC_ELT (vector_costs, i, cost)
6887 /* We inherit from the previous COST, invariants, externals and
6888 extracts immediately follow the cost for the related stmt. */
6889 if (cost->stmt_info)
6890 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6891 li_vector_costs.quick_push (std::make_pair (l, cost));
6893 li_scalar_costs.qsort (li_cost_vec_cmp);
6894 li_vector_costs.qsort (li_cost_vec_cmp);
6896 /* Now cost the portions individually. */
6897 unsigned vi = 0;
6898 unsigned si = 0;
6899 bool profitable = true;
6900 while (si < li_scalar_costs.length ()
6901 && vi < li_vector_costs.length ())
6903 unsigned sl = li_scalar_costs[si].first;
6904 unsigned vl = li_vector_costs[vi].first;
6905 if (sl != vl)
6907 if (dump_enabled_p ())
6908 dump_printf_loc (MSG_NOTE, vect_location,
6909 "Scalar %d and vector %d loop part do not "
6910 "match up, skipping scalar part\n", sl, vl);
6911 /* Skip the scalar part, assuming zero cost on the vector side. */
6914 si++;
6916 while (si < li_scalar_costs.length ()
6917 && li_scalar_costs[si].first == sl);
6918 continue;
6921 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
6924 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
6925 si++;
6927 while (si < li_scalar_costs.length ()
6928 && li_scalar_costs[si].first == sl);
6929 unsigned dummy;
6930 finish_cost (scalar_target_cost_data, nullptr,
6931 &dummy, &scalar_cost, &dummy);
6933 /* Complete the target-specific vector cost calculation. */
6934 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
6937 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
6938 vi++;
6940 while (vi < li_vector_costs.length ()
6941 && li_vector_costs[vi].first == vl);
6942 finish_cost (vect_target_cost_data, scalar_target_cost_data,
6943 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
6944 delete scalar_target_cost_data;
6945 delete vect_target_cost_data;
6947 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
6949 if (dump_enabled_p ())
6951 dump_printf_loc (MSG_NOTE, vect_location,
6952 "Cost model analysis for part in loop %d:\n", sl);
6953 dump_printf (MSG_NOTE, " Vector cost: %d\n",
6954 vec_inside_cost + vec_outside_cost);
6955 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
6958 /* Vectorization is profitable if its cost is less than the cost of the
6959 scalar version. Note that we err on the vector side for equal cost because
6960 the cost estimate is otherwise quite pessimistic (constant uses are
6961 free on the scalar side but cost a load on the vector side for
6962 example). */
6963 if (vec_outside_cost + vec_inside_cost > scalar_cost)
6965 profitable = false;
6966 break;
6969 if (profitable && vi < li_vector_costs.length ())
6971 if (dump_enabled_p ())
6972 dump_printf_loc (MSG_NOTE, vect_location,
6973 "Excess vector cost for part in loop %d:\n",
6974 li_vector_costs[vi].first);
6975 profitable = false;
6978 /* Unset visited flag. This is delayed when the subgraph is profitable
6979 and we process the loop for remaining unvectorized if-converted code. */
6980 if (!orig_loop || !profitable)
6981 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
6982 gimple_set_visited (cost->stmt_info->stmt, false);
6984 scalar_costs.release ();
6985 vector_costs.release ();
6987 return profitable;
6990 /* qsort comparator for lane defs. */
6992 static int
6993 vld_cmp (const void *a_, const void *b_)
6995 auto *a = (const std::pair<unsigned, tree> *)a_;
6996 auto *b = (const std::pair<unsigned, tree> *)b_;
6997 return a->first - b->first;
7000 /* Return true if USE_STMT is a vector lane insert into VEC and set
7001 *THIS_LANE to the lane number that is set. */
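/* E.g. a BIT_INSERT_EXPR of a 32-bit value at bit position 64 into a
   vector of 32-bit elements is a lane insert with *THIS_LANE == 2. */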
7003 static bool
7004 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7006 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
7007 if (!use_ass
7008 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7009 || (vec
7010 ? gimple_assign_rhs1 (use_ass) != vec
7011 : ((vec = gimple_assign_rhs1 (use_ass)), false))
7012 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7013 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7014 || !constant_multiple_p
7015 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7016 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7017 this_lane))
7018 return false;
7019 return true;
7022 /* Find any vectorizable constructors and add them to the grouped_store
7023 array. */
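/* Illustrative shapes matched below: a vector CONSTRUCTOR whose elements
   are all SSA defs from the region, e.g. v_5 = {a_1, b_2, c_3, d_4};
   a chain of BIT_INSERT_EXPRs that together fill every lane of a vector;
   and an associatable scalar op chain recorded as a BB reduction root.
   The latter two are recorded in bb_vinfo->roots rather than as grouped
   stores. */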
7025 static void
7026 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
7028 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7029 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7030 !gsi_end_p (gsi); gsi_next (&gsi))
7032 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7033 if (!assign)
7034 continue;
7036 tree rhs = gimple_assign_rhs1 (assign);
7037 enum tree_code code = gimple_assign_rhs_code (assign);
7038 use_operand_p use_p;
7039 gimple *use_stmt;
7040 if (code == CONSTRUCTOR)
7042 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7043 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7044 CONSTRUCTOR_NELTS (rhs))
7045 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7046 || uniform_vector_p (rhs))
7047 continue;
7049 unsigned j;
7050 tree val;
7051 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7052 if (TREE_CODE (val) != SSA_NAME
7053 || !bb_vinfo->lookup_def (val))
7054 break;
7055 if (j != CONSTRUCTOR_NELTS (rhs))
7056 continue;
7058 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
7059 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
7061 else if (code == BIT_INSERT_EXPR
7062 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7063 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7064 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7065 && integer_zerop (gimple_assign_rhs3 (assign))
7066 && useless_type_conversion_p
7067 (TREE_TYPE (TREE_TYPE (rhs)),
7068 TREE_TYPE (gimple_assign_rhs2 (assign)))
7069 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7071 /* We start to match on insert to lane zero but since the
7072 inserts need not be ordered we'd have to search both
7073 the def and the use chains. */
7074 tree vectype = TREE_TYPE (rhs);
7075 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7076 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7077 auto_sbitmap lanes (nlanes);
7078 bitmap_clear (lanes);
7079 bitmap_set_bit (lanes, 0);
7080 tree def = gimple_assign_lhs (assign);
7081 lane_defs.quick_push
7082 (std::make_pair (0, gimple_assign_rhs2 (assign)));
7083 unsigned lanes_found = 1;
7084 /* Start with the use chains; the last stmt will be the root. */
7085 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7086 vec<stmt_vec_info> roots = vNULL;
7087 roots.safe_push (last);
7090 use_operand_p use_p;
7091 gimple *use_stmt;
7092 if (!single_imm_use (def, &use_p, &use_stmt))
7093 break;
7094 unsigned this_lane;
7095 if (!bb_vinfo->lookup_stmt (use_stmt)
7096 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7097 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7098 break;
7099 if (bitmap_bit_p (lanes, this_lane))
7100 break;
7101 lanes_found++;
7102 bitmap_set_bit (lanes, this_lane);
7103 gassign *use_ass = as_a <gassign *> (use_stmt);
7104 lane_defs.quick_push (std::make_pair
7105 (this_lane, gimple_assign_rhs2 (use_ass)));
7106 last = bb_vinfo->lookup_stmt (use_ass);
7107 roots.safe_push (last);
7108 def = gimple_assign_lhs (use_ass);
7110 while (lanes_found < nlanes);
7111 if (roots.length () > 1)
7112 std::swap(roots[0], roots[roots.length () - 1]);
7113 if (lanes_found < nlanes)
7115 /* Now search the def chain. */
7116 def = gimple_assign_rhs1 (assign);
7119 if (TREE_CODE (def) != SSA_NAME
7120 || !has_single_use (def))
7121 break;
7122 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7123 unsigned this_lane;
7124 if (!bb_vinfo->lookup_stmt (def_stmt)
7125 || !vect_slp_is_lane_insert (def_stmt,
7126 NULL_TREE, &this_lane)
7127 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7128 break;
7129 if (bitmap_bit_p (lanes, this_lane))
7130 break;
7131 lanes_found++;
7132 bitmap_set_bit (lanes, this_lane);
7133 lane_defs.quick_push (std::make_pair
7134 (this_lane,
7135 gimple_assign_rhs2 (def_stmt)));
7136 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7137 def = gimple_assign_rhs1 (def_stmt);
7139 while (lanes_found < nlanes);
7141 if (lanes_found == nlanes)
7143 /* Sort lane_defs by lane index and register the root. */
7144 lane_defs.qsort (vld_cmp);
7145 vec<stmt_vec_info> stmts;
7146 stmts.create (nlanes);
7147 for (unsigned i = 0; i < nlanes; ++i)
7148 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7149 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7150 stmts, roots));
7152 else
7153 roots.release ();
7155 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7156 && (associative_tree_code (code) || code == MINUS_EXPR)
7157 /* ??? The flag_associative_math and TYPE_OVERFLOW_WRAPS
7158 checks pessimize a two-element reduction. PR54400.
7159 ??? In-order reduction could be handled if we only
7160 traverse one operand chain in vect_slp_linearize_chain. */
7161 && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
7162 || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
7163 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
7164 /* Ops with constants at the tail can be stripped here. */
7165 && TREE_CODE (rhs) == SSA_NAME
7166 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7167 /* Should be the chain end. */
7168 && (!single_imm_use (gimple_assign_lhs (assign),
7169 &use_p, &use_stmt)
7170 || !is_gimple_assign (use_stmt)
7171 || (gimple_assign_rhs_code (use_stmt) != code
7172 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7173 || (gimple_assign_rhs_code (use_stmt)
7174 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7176 /* We start the match at the end of a possible association
7177 chain. */
7178 auto_vec<chain_op_t> chain;
7179 auto_vec<std::pair<tree_code, gimple *> > worklist;
7180 auto_vec<gimple *> chain_stmts;
7181 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7182 if (code == MINUS_EXPR)
7183 code = PLUS_EXPR;
7184 internal_fn reduc_fn;
7185 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7186 || reduc_fn == IFN_LAST)
7187 continue;
7188 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7189 /* ??? */
7190 code_stmt, alt_code_stmt, &chain_stmts);
7191 if (chain.length () > 1)
7193 /* Sort the chain according to def_type and operation. */
7194 chain.sort (dt_sort_cmp, bb_vinfo);
7195 /* ??? Now we'd want to strip externals and constants
7196 but record those to be handled in the epilogue. */
7197 /* ??? For now do not allow mixing ops or externs/constants. */
7198 bool invalid = false;
7199 for (unsigned i = 0; i < chain.length (); ++i)
7200 if (chain[i].dt != vect_internal_def
7201 || chain[i].code != code)
7202 invalid = true;
7203 if (!invalid)
7205 vec<stmt_vec_info> stmts;
7206 stmts.create (chain.length ());
7207 for (unsigned i = 0; i < chain.length (); ++i)
7208 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
7209 vec<stmt_vec_info> roots;
7210 roots.create (chain_stmts.length ());
7211 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7212 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7213 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7214 stmts, roots));
7221 /* Walk the grouped store chains and replace entries with their
7222 pattern variant if any. */
7224 static void
7225 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7227 stmt_vec_info first_element;
7228 unsigned i;
7230 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7232 /* We also have CTORs in this array. */
7233 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7234 continue;
7235 if (STMT_VINFO_IN_PATTERN_P (first_element))
7237 stmt_vec_info orig = first_element;
7238 first_element = STMT_VINFO_RELATED_STMT (first_element);
7239 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7240 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7241 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7242 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7243 vinfo->grouped_stores[i] = first_element;
7245 stmt_vec_info prev = first_element;
7246 while (DR_GROUP_NEXT_ELEMENT (prev))
7248 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7249 if (STMT_VINFO_IN_PATTERN_P (elt))
7251 stmt_vec_info orig = elt;
7252 elt = STMT_VINFO_RELATED_STMT (elt);
7253 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7254 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7255 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7257 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7258 prev = elt;
7263 /* Check if the region described by BB_VINFO can be vectorized, returning
7264 true if so. When returning false, set FATAL to true if the same failure
7265 would prevent vectorization at other vector sizes, false if it is still
7266 worth trying other sizes. N_STMTS is the number of statements in the
7267 region. */
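/* Roughly: analyze data references and accesses, detect vectorizable
   constructors, run pattern recognition, build and optimize the SLP
   trees, verify alignment and dependences per instance, analyze the
   operations and finally partition the SLP graph for costing. */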
7269 static bool
7270 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7271 vec<int> *dataref_groups)
7273 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7275 slp_instance instance;
7276 int i;
7277 poly_uint64 min_vf = 2;
7279 /* The first group of checks is independent of the vector size. */
7280 fatal = true;
7282 /* Analyze the data references. */
7284 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7286 if (dump_enabled_p ())
7287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7288 "not vectorized: unhandled data-ref in basic "
7289 "block.\n");
7290 return false;
7293 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7295 if (dump_enabled_p ())
7296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7297 "not vectorized: unhandled data access in "
7298 "basic block.\n");
7299 return false;
7302 vect_slp_check_for_constructors (bb_vinfo);
7304 /* If there are no grouped stores and no constructors in the region
7305 there is no need to continue with pattern recog as vect_analyze_slp
7306 will fail anyway. */
7307 if (bb_vinfo->grouped_stores.is_empty ()
7308 && bb_vinfo->roots.is_empty ())
7310 if (dump_enabled_p ())
7311 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7312 "not vectorized: no grouped stores in "
7313 "basic block.\n");
7314 return false;
7317 /* The rest of the analysis below depends on the vector size in some way, so failures are no longer necessarily fatal. */
7318 fatal = false;
7320 vect_pattern_recog (bb_vinfo);
7322 /* Update store groups from pattern processing. */
7323 vect_fixup_store_groups_with_patterns (bb_vinfo);
7325 /* Check the SLP opportunities in the basic block, analyze and build SLP
7326 trees. */
7327 if (!vect_analyze_slp (bb_vinfo, n_stmts))
7329 if (dump_enabled_p ())
7331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7332 "Failed to SLP the basic block.\n");
7333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7334 "not vectorized: failed to find SLP opportunities "
7335 "in basic block.\n");
7337 return false;
7340 /* Optimize permutations. */
7341 vect_optimize_slp (bb_vinfo);
7343 /* Gather the loads reachable from the SLP graph entries. */
7344 vect_gather_slp_loads (bb_vinfo);
7346 vect_record_base_alignments (bb_vinfo);
7348 /* Analyze and verify the alignment of data references and the
7349 dependence in the SLP instances. */
7350 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7352 vect_location = instance->location ();
7353 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7354 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7356 slp_tree node = SLP_INSTANCE_TREE (instance);
7357 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7358 if (dump_enabled_p ())
7359 dump_printf_loc (MSG_NOTE, vect_location,
7360 "removing SLP instance operations starting from: %G",
7361 stmt_info->stmt);
7362 vect_free_slp_instance (instance);
7363 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7364 continue;
7367 /* Mark all the statements that we want to vectorize as pure SLP and
7368 relevant. */
7369 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7370 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7371 unsigned j;
7372 stmt_vec_info root;
7373 /* Likewise consider instance root stmts as vectorized. */
7374 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7375 STMT_SLP_TYPE (root) = pure_slp;
7377 i++;
7379 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7380 return false;
7382 if (!vect_slp_analyze_operations (bb_vinfo))
7384 if (dump_enabled_p ())
7385 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7386 "not vectorized: bad operation in basic block.\n");
7387 return false;
7390 vect_bb_partition_graph (bb_vinfo);
7392 return true;
7395 /* Subroutine of vect_slp_bbs. Try to vectorize the statements for all
7396 basic blocks in BBS, returning true on success.
7397 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
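/* The analysis below is redone for each vector mode the target suggests
   until one succeeds, the failure is known to be independent of the
   vector size (FATAL), or a mode would merely repeat a previous result. */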
7399 static bool
7400 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7401 vec<int> *dataref_groups, unsigned int n_stmts,
7402 loop_p orig_loop)
7404 bb_vec_info bb_vinfo;
7405 auto_vector_modes vector_modes;
7407 /* Autodetect first vector size we try. */
7408 machine_mode next_vector_mode = VOIDmode;
7409 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7410 unsigned int mode_i = 0;
7412 vec_info_shared shared;
7414 machine_mode autodetected_vector_mode = VOIDmode;
7415 while (1)
7417 bool vectorized = false;
7418 bool fatal = false;
7419 bb_vinfo = new _bb_vec_info (bbs, &shared);
7421 bool first_time_p = shared.datarefs.is_empty ();
7422 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7423 if (first_time_p)
7424 bb_vinfo->shared->save_datarefs ();
7425 else
7426 bb_vinfo->shared->check_datarefs ();
7427 bb_vinfo->vector_mode = next_vector_mode;
7429 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7431 if (dump_enabled_p ())
7433 dump_printf_loc (MSG_NOTE, vect_location,
7434 "***** Analysis succeeded with vector mode"
7435 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7436 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7439 bb_vinfo->shared->check_datarefs ();
7441 auto_vec<slp_instance> profitable_subgraphs;
7442 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7444 if (instance->subgraph_entries.is_empty ())
7445 continue;
7447 vect_location = instance->location ();
7448 if (!unlimited_cost_model (NULL)
7449 && !vect_bb_vectorization_profitable_p
7450 (bb_vinfo, instance->subgraph_entries, orig_loop))
7452 if (dump_enabled_p ())
7453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7454 "not vectorized: vectorization is not "
7455 "profitable.\n");
7456 continue;
7459 if (!dbg_cnt (vect_slp))
7460 continue;
7462 profitable_subgraphs.safe_push (instance);
7465 /* When we're vectorizing an if-converted loop body make sure
7466 we vectorized all if-converted code. */
7467 if (!profitable_subgraphs.is_empty ()
7468 && orig_loop)
7470 gcc_assert (bb_vinfo->bbs.length () == 1);
7471 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7472 !gsi_end_p (gsi); gsi_next (&gsi))
7474 /* The costing above left us with DCEable vectorized scalar
7475 stmts having the visited flag set on profitable
7476 subgraphs. Do the delayed clearing of the flag here. */
7477 if (gimple_visited_p (gsi_stmt (gsi)))
7479 gimple_set_visited (gsi_stmt (gsi), false);
7480 continue;
7482 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7483 continue;
7485 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7486 if (gimple_assign_rhs_code (ass) == COND_EXPR)
7488 if (!profitable_subgraphs.is_empty ()
7489 && dump_enabled_p ())
7490 dump_printf_loc (MSG_NOTE, vect_location,
7491 "not profitable because of "
7492 "unprofitable if-converted scalar "
7493 "code\n");
7494 profitable_subgraphs.truncate (0);
7499 /* Finally schedule the profitable subgraphs. */
7500 for (slp_instance instance : profitable_subgraphs)
7502 if (!vectorized && dump_enabled_p ())
7503 dump_printf_loc (MSG_NOTE, vect_location,
7504 "Basic block will be vectorized "
7505 "using SLP\n");
7506 vectorized = true;
7508 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7510 unsigned HOST_WIDE_INT bytes;
7511 if (dump_enabled_p ())
7513 if (GET_MODE_SIZE
7514 (bb_vinfo->vector_mode).is_constant (&bytes))
7515 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
7516 "basic block part vectorized using %wu "
7517 "byte vectors\n", bytes);
7518 else
7519 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
7520 "basic block part vectorized using "
7521 "variable length vectors\n");
7525 else
7527 if (dump_enabled_p ())
7528 dump_printf_loc (MSG_NOTE, vect_location,
7529 "***** Analysis failed with vector mode %s\n",
7530 GET_MODE_NAME (bb_vinfo->vector_mode));
7533 if (mode_i == 0)
7534 autodetected_vector_mode = bb_vinfo->vector_mode;
7536 if (!fatal)
7537 while (mode_i < vector_modes.length ()
7538 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7540 if (dump_enabled_p ())
7541 dump_printf_loc (MSG_NOTE, vect_location,
7542 "***** The result for vector mode %s would"
7543 " be the same\n",
7544 GET_MODE_NAME (vector_modes[mode_i]));
7545 mode_i += 1;
7548 delete bb_vinfo;
7550 if (mode_i < vector_modes.length ()
7551 && VECTOR_MODE_P (autodetected_vector_mode)
7552 && (related_vector_mode (vector_modes[mode_i],
7553 GET_MODE_INNER (autodetected_vector_mode))
7554 == autodetected_vector_mode)
7555 && (related_vector_mode (autodetected_vector_mode,
7556 GET_MODE_INNER (vector_modes[mode_i]))
7557 == vector_modes[mode_i]))
7559 if (dump_enabled_p ())
7560 dump_printf_loc (MSG_NOTE, vect_location,
7561 "***** Skipping vector mode %s, which would"
7562 " repeat the analysis for %s\n",
7563 GET_MODE_NAME (vector_modes[mode_i]),
7564 GET_MODE_NAME (autodetected_vector_mode));
7565 mode_i += 1;
7568 if (vectorized
7569 || mode_i == vector_modes.length ()
7570 || autodetected_vector_mode == VOIDmode
7571 /* If vect_slp_analyze_bb_1 signaled that analysis for all
7572 vector sizes will fail do not bother iterating. */
7573 || fatal)
7574 return vectorized;
7576 /* Try the next biggest vector size. */
7577 next_vector_mode = vector_modes[mode_i++];
7578 if (dump_enabled_p ())
7579 dump_printf_loc (MSG_NOTE, vect_location,
7580 "***** Re-trying analysis with vector mode %s\n",
7581 GET_MODE_NAME (next_vector_mode));
7586 /* Worker for the BB vectorizer entry points. Analyze and transform the
7587 basic blocks in BBS, returning true if anything was vectorized. */
7589 static bool
7590 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7592 vec<data_reference_p> datarefs = vNULL;
7593 auto_vec<int> dataref_groups;
7594 int insns = 0;
7595 int current_group = 0;
7597 for (unsigned i = 0; i < bbs.length (); i++)
7599 basic_block bb = bbs[i];
7600 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
7601 gsi_next (&gsi))
7603 gimple *stmt = gsi_stmt (gsi);
7604 if (is_gimple_debug (stmt))
7605 continue;
7607 insns++;
7609 if (gimple_location (stmt) != UNKNOWN_LOCATION)
7610 vect_location = stmt;
7612 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
7613 &dataref_groups, current_group))
7614 ++current_group;
7616 /* New BBs always start a new DR group. */
7617 ++current_group;
7620 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
7623 /* Special entry for the BB vectorizer. Analyze and transform a single
7624 if-converted BB, with ORIG_LOOP's body being the non-if-converted
7625 representation. Returns true if anything in the basic-block was
7626 vectorized. */
7628 bool
7629 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
7631 auto_vec<basic_block> bbs;
7632 bbs.safe_push (bb);
7633 return vect_slp_bbs (bbs, orig_loop);
7636 /* Main entry for the BB vectorizer. Analyze and transform the function
7637 FUN, returning true if anything in it was vectorized. */
7639 bool
7640 vect_slp_function (function *fun)
7642 bool r = false;
7643 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
7644 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
7646 /* For the moment split the function into pieces to avoid making
7647 the iteration on the vector mode moot. Split at points we know
7648 we do not handle well, which are CFG merges (SLP discovery doesn't
7649 handle non-loop-header PHIs) and loop exits. Since pattern
7650 recog requires reverse iteration to visit uses before defs,
7651 simply chop the RPO into pieces. */
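/* Concretely, a region is ended before a block not dominated by the
   region entry or outside the entry block's loop, and after a
   control-altering stmt that defines a value; a block starting with a
   returns-twice call cannot start a region. */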
7652 auto_vec<basic_block> bbs;
7653 for (unsigned i = 0; i < n; i++)
7655 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
7656 bool split = false;
7658 /* Split when a BB is not dominated by the first block. */
7659 if (!bbs.is_empty ()
7660 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
7662 if (dump_enabled_p ())
7663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7664 "splitting region at dominance boundary bb%d\n",
7665 bb->index);
7666 split = true;
7668 /* Split when the loop determined by the first block
7669 is exited. This is because we eventually insert
7670 invariants at region begin. */
7671 else if (!bbs.is_empty ()
7672 && bbs[0]->loop_father != bb->loop_father
7673 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
7675 if (dump_enabled_p ())
7676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7677 "splitting region at loop %d exit at bb%d\n",
7678 bbs[0]->loop_father->num, bb->index);
7679 split = true;
7682 if (split && !bbs.is_empty ())
7684 r |= vect_slp_bbs (bbs, NULL);
7685 bbs.truncate (0);
7688 /* We need to be able to insert at the head of the region, which
7689 we cannot do for a region starting with a returns-twice call. */
7690 if (bbs.is_empty ())
7691 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
7692 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
7694 if (dump_enabled_p ())
7695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7696 "skipping bb%d as start of region as it "
7697 "starts with returns-twice call\n",
7698 bb->index);
7699 continue;
7702 bbs.safe_push (bb);
7704 /* When a stmt ends this block and defines a value, inserting
7705 after it for a vector containing its definition would require
7706 inserting on edges. Avoid this for now. */
7707 if (gimple *last = *gsi_last_bb (bb))
7708 if (gimple_get_lhs (last)
7709 && is_ctrl_altering_stmt (last))
7711 if (dump_enabled_p ())
7712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7713 "splitting region at control altering "
7714 "definition %G", last);
7715 r |= vect_slp_bbs (bbs, NULL);
7716 bbs.truncate (0);
7720 if (!bbs.is_empty ())
7721 r |= vect_slp_bbs (bbs, NULL);
7723 free (rpo);
7725 return r;
7728 /* Build a variable-length vector in which the elements in ELTS are repeated
7729 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
7730 RESULTS and add any new instructions to SEQ.
7732 The approach we use is:
7734 (1) Find a vector mode VM with integer elements of mode IM.
7736 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7737 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
7738 from small vectors to IM.
7740 (3) Duplicate each ELTS'[I] into a vector of mode VM.
7742 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
7743 correct byte contents.
7745 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
7747 We try to find the largest IM for which this sequence works, in order
7748 to cut down on the number of interleaves. */
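/* E.g. in the simplest case, with ELTS = { a, b } of 32-bit elements and
   NVECTORS == 1, step (2) packs the pair into one 64-bit element, step (3)
   duplicates it across a vector of 64-bit elements and step (5)
   view-converts the result back, giving { a, b, a, b, ... } with no
   interleaving required. */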
7750 void
7751 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
7752 const vec<tree> &elts, unsigned int nresults,
7753 vec<tree> &results)
7755 unsigned int nelts = elts.length ();
7756 tree element_type = TREE_TYPE (vector_type);
7758 /* (1) Find a vector mode VM with integer elements of mode IM. */
7759 unsigned int nvectors = 1;
7760 tree new_vector_type;
7761 tree permutes[2];
7762 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
7763 &nvectors, &new_vector_type,
7764 permutes))
7765 gcc_unreachable ();
7767 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
7768 unsigned int partial_nelts = nelts / nvectors;
7769 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
7771 tree_vector_builder partial_elts;
7772 auto_vec<tree, 32> pieces (nvectors * 2);
7773 pieces.quick_grow_cleared (nvectors * 2);
7774 for (unsigned int i = 0; i < nvectors; ++i)
7776 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7777 ELTS' has mode IM. */
7778 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
7779 for (unsigned int j = 0; j < partial_nelts; ++j)
7780 partial_elts.quick_push (elts[i * partial_nelts + j]);
7781 tree t = gimple_build_vector (seq, &partial_elts);
7782 t = gimple_build (seq, VIEW_CONVERT_EXPR,
7783 TREE_TYPE (new_vector_type), t);
7785 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
7786 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
7789 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
7790 correct byte contents.
7792 Conceptually, we need to repeat the following operation log2(nvectors)
7793 times, where hi_start = nvectors / 2:
7795 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
7796 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
7798 However, if each input repeats every N elements and the VF is
7799 a multiple of N * 2, the HI result is the same as the LO result.
7800 This will be true for the first N1 iterations of the outer loop,
7801 followed by N2 iterations for which both the LO and HI results
7802 are needed. I.e.:
7804 N1 + N2 = log2(nvectors)
7806 Each "N1 iteration" doubles the number of redundant vectors and the
7807 effect of the process as a whole is to have a sequence of nvectors/2**N1
7808 vectors that repeats 2**N1 times. Rather than generate these redundant
7809 vectors, we halve the number of vectors for each N1 iteration. */
7810 unsigned int in_start = 0;
7811 unsigned int out_start = nvectors;
7812 unsigned int new_nvectors = nvectors;
7813 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
7815 unsigned int hi_start = new_nvectors / 2;
7816 unsigned int out_i = 0;
7817 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
7819 if ((in_i & 1) != 0
7820 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
7821 2 * in_repeat))
7822 continue;
7824 tree output = make_ssa_name (new_vector_type);
7825 tree input1 = pieces[in_start + (in_i / 2)];
7826 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
7827 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
7828 input1, input2,
7829 permutes[in_i & 1]);
7830 gimple_seq_add_stmt (seq, stmt);
7831 pieces[out_start + out_i] = output;
7832 out_i += 1;
7834 std::swap (in_start, out_start);
7835 new_nvectors = out_i;
7838 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
7839 results.reserve (nresults);
7840 for (unsigned int i = 0; i < nresults; ++i)
7841 if (i < new_nvectors)
7842 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
7843 pieces[in_start + i]));
7844 else
7845 results.quick_push (results[i - new_nvectors]);
7849 /* For constant and loop invariant defs in OP_NODE this function creates
7850 vector defs that will be used in the vectorized stmts and stores them
7851 to SLP_TREE_VEC_DEFS of OP_NODE. */
7853 static void
7854 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
7856 unsigned HOST_WIDE_INT nunits;
7857 tree vec_cst;
7858 unsigned j, number_of_places_left_in_vector;
7859 tree vector_type;
7860 tree vop;
7861 int group_size = op_node->ops.length ();
7862 unsigned int vec_num, i;
7863 unsigned number_of_copies = 1;
7864 bool constant_p;
7865 gimple_seq ctor_seq = NULL;
7866 auto_vec<tree, 16> permute_results;
7868 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
7869 vector_type = SLP_TREE_VECTYPE (op_node);
7871 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
7872 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
7873 auto_vec<tree> voprnds (number_of_vectors);
7875 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
7876 created vectors. It is greater than 1 if unrolling is performed.
7878 For example, we have two scalar operands, s1 and s2 (e.g., group of
7879 strided accesses of size two), while NUNITS is four (i.e., four scalars
7880 of this type can be packed in a vector). The output vector will contain
7881 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
7882 will be 2).
7884 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
7885 containing the operands.
7887 For example, NUNITS is four as before, and the group size is 8
7888 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
7889 {s5, s6, s7, s8}. */
7891 /* When using duplicate_and_interleave, we just need one element for
7892 each scalar statement. */
7893 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
7894 nunits = group_size;
7896 number_of_copies = nunits * number_of_vectors / group_size;
7898 number_of_places_left_in_vector = nunits;
7899 constant_p = true;
7900 tree_vector_builder elts (vector_type, nunits, 1);
7901 elts.quick_grow (nunits);
7902 stmt_vec_info insert_after = NULL;
7903 for (j = 0; j < number_of_copies; j++)
7905 tree op;
7906 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
7908 /* Create 'vect_ = {op0,op1,...,opn}'. */
7909 number_of_places_left_in_vector--;
7910 tree orig_op = op;
7911 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
7913 if (CONSTANT_CLASS_P (op))
7915 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
7917 /* Can't use VIEW_CONVERT_EXPR for booleans because
7918 of possibly different sizes of scalar value and
7919 vector element. */
7920 if (integer_zerop (op))
7921 op = build_int_cst (TREE_TYPE (vector_type), 0);
7922 else if (integer_onep (op))
7923 op = build_all_ones_cst (TREE_TYPE (vector_type));
7924 else
7925 gcc_unreachable ();
7927 else
7928 op = fold_unary (VIEW_CONVERT_EXPR,
7929 TREE_TYPE (vector_type), op);
7930 gcc_assert (op && CONSTANT_CLASS_P (op));
7932 else
7934 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
7935 gimple *init_stmt;
7936 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
7938 tree true_val
7939 = build_all_ones_cst (TREE_TYPE (vector_type));
7940 tree false_val
7941 = build_zero_cst (TREE_TYPE (vector_type));
7942 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
7943 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
7944 op, true_val,
7945 false_val);
7947 else
7949 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
7950 op);
7951 init_stmt
7952 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
7953 op);
7955 gimple_seq_add_stmt (&ctor_seq, init_stmt);
7956 op = new_temp;
7959 elts[number_of_places_left_in_vector] = op;
7960 if (!CONSTANT_CLASS_P (op))
7961 constant_p = false;
7962 /* For BB vectorization we have to compute an insert location
7963 when a def is inside the analyzed region since we cannot
7964 simply insert at the BB start in this case. */
7965 stmt_vec_info opdef;
7966 if (TREE_CODE (orig_op) == SSA_NAME
7967 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
7968 && is_a <bb_vec_info> (vinfo)
7969 && (opdef = vinfo->lookup_def (orig_op)))
7971 if (!insert_after)
7972 insert_after = opdef;
7973 else
7974 insert_after = get_later_stmt (insert_after, opdef);
7977 if (number_of_places_left_in_vector == 0)
7979 if (constant_p
7980 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
7981 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
7982 vec_cst = gimple_build_vector (&ctor_seq, &elts);
7983 else
7985 if (permute_results.is_empty ())
7986 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
7987 elts, number_of_vectors,
7988 permute_results);
7989 vec_cst = permute_results[number_of_vectors - j - 1];
7991 if (!gimple_seq_empty_p (ctor_seq))
7993 if (insert_after)
7995 gimple_stmt_iterator gsi;
7996 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
7998 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
7999 gsi_insert_seq_before (&gsi, ctor_seq,
8000 GSI_CONTINUE_LINKING);
8002 else if (!stmt_ends_bb_p (insert_after->stmt))
8004 gsi = gsi_for_stmt (insert_after->stmt);
8005 gsi_insert_seq_after (&gsi, ctor_seq,
8006 GSI_CONTINUE_LINKING);
8008 else
8010 /* When we want to insert after a def whose
8011 defining stmt throws, insert on the fallthru
8012 edge. */
8013 edge e = find_fallthru_edge
8014 (gimple_bb (insert_after->stmt)->succs);
8015 basic_block new_bb
8016 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8017 gcc_assert (!new_bb);
8020 else
8021 vinfo->insert_seq_on_entry (NULL, ctor_seq);
8022 ctor_seq = NULL;
8024 voprnds.quick_push (vec_cst);
8025 insert_after = NULL;
8026 number_of_places_left_in_vector = nunits;
8027 constant_p = true;
8028 elts.new_vector (vector_type, nunits, 1);
8029 elts.quick_grow (nunits);
8034 /* Since the vectors are created in the reverse order, we should invert
8035 them. */
8036 vec_num = voprnds.length ();
8037 for (j = vec_num; j != 0; j--)
8039 vop = voprnds[j - 1];
8040 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8043 /* In case that VF is greater than the unrolling factor needed for the SLP
8044 group of stmts, NUMBER_OF_VECTORS to be created is greater than
8045 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8046 to replicate the vectors. */
8047 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8048 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8049 i++)
8050 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8053 /* Get the Ith vectorized definition from SLP_NODE. */
8055 tree
8056 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8058 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
8059 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
8060 else
8061 return SLP_TREE_VEC_DEFS (slp_node)[i];
8064 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8066 void
8067 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8069 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8070 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
8072 unsigned j;
8073 gimple *vec_def_stmt;
8074 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
8075 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
8077 else
8078 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8081 /* Get N vectorized definitions for SLP_NODE. */
8083 void
8084 vect_get_slp_defs (vec_info *,
8085 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8087 if (n == -1U)
8088 n = SLP_TREE_CHILDREN (slp_node).length ();
8090 for (unsigned i = 0; i < n; ++i)
8092 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8093 vec<tree> vec_defs = vNULL;
8094 vect_get_slp_defs (child, &vec_defs);
8095 vec_oprnds->quick_push (vec_defs);
8099 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8100 - PERM gives the permutation that the caller wants to use for NODE,
8101 which might be different from SLP_LOAD_PERMUTATION.
8102 - DUMP_P controls whether the function dumps information. */
8104 static bool
8105 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8106 load_permutation_t &perm,
8107 const vec<tree> &dr_chain,
8108 gimple_stmt_iterator *gsi, poly_uint64 vf,
8109 bool analyze_only, bool dump_p,
8110 unsigned *n_perms, unsigned int *n_loads,
8111 bool dce_chain)
8113 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8114 int vec_index = 0;
8115 tree vectype = SLP_TREE_VECTYPE (node);
8116 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8117 unsigned int mask_element;
8118 machine_mode mode;
8120 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8121 return false;
8123 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8125 mode = TYPE_MODE (vectype);
8126 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8127 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8129 /* Initialize the vect stmts of NODE to properly insert the generated
8130 stmts later. */
8131 if (! analyze_only)
8132 for (unsigned i = SLP_TREE_VEC_STMTS (node).length (); i < nstmts; i++)
8133 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
8135 /* Generate permutation masks for every NODE. Number of masks for each NODE
8136 is equal to GROUP_SIZE.
8137 E.g., we have a group of three nodes with three loads from the same
8138 location in each node, and the vector size is 4. I.e., we have an
8139 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8140 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8141 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8144 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8145 The last mask is illegal since we assume two operands for permute
8146 operation, and the mask element values can't be outside that range.
8147 Hence, the last mask must be converted into {2,5,5,5}.
8148 For the first two permutations we need the first and the second input
8149 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8150 we need the second and the third vectors: {b1,c1,a2,b2} and
8151 {c2,a3,b3,c3}. */
8153 int vect_stmts_counter = 0;
8154 unsigned int index = 0;
8155 int first_vec_index = -1;
8156 int second_vec_index = -1;
8157 bool noop_p = true;
8158 *n_perms = 0;
8160 vec_perm_builder mask;
8161 unsigned int nelts_to_build;
8162 unsigned int nvectors_per_build;
8163 unsigned int in_nlanes;
8164 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
8165 && multiple_p (nunits, group_size));
8166 if (repeating_p)
8168 /* A single vector contains a whole number of copies of the node, so:
8169 (a) all permutes can use the same mask; and
8170 (b) the permutes only need a single vector input. */
8171 mask.new_vector (nunits, group_size, 3);
8172 nelts_to_build = mask.encoded_nelts ();
8173 /* It's possible to obtain zero nstmts during analyze_only, so make
8174 it at least one to ensure the later computation for n_perms
8175 proceeds. */
8176 nvectors_per_build = nstmts > 0 ? nstmts : 1;
8177 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
8179 else
8181 /* We need to construct a separate mask for each vector statement. */
8182 unsigned HOST_WIDE_INT const_nunits, const_vf;
8183 if (!nunits.is_constant (&const_nunits)
8184 || !vf.is_constant (&const_vf))
8185 return false;
8186 mask.new_vector (const_nunits, const_nunits, 1);
8187 nelts_to_build = const_vf * group_size;
8188 nvectors_per_build = 1;
8189 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
8191 auto_sbitmap used_in_lanes (in_nlanes);
8192 bitmap_clear (used_in_lanes);
8193 auto_bitmap used_defs;
8195 unsigned int count = mask.encoded_nelts ();
8196 mask.quick_grow (count);
8197 vec_perm_indices indices;
8199 for (unsigned int j = 0; j < nelts_to_build; j++)
8201 unsigned int iter_num = j / group_size;
8202 unsigned int stmt_num = j % group_size;
8203 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info) + perm[stmt_num]);
8204 bitmap_set_bit (used_in_lanes, i);
8205 if (repeating_p)
8207 first_vec_index = 0;
8208 mask_element = i;
8210 else
8212 /* Enforced before the loop when !repeating_p. */
8213 unsigned int const_nunits = nunits.to_constant ();
8214 vec_index = i / const_nunits;
8215 mask_element = i % const_nunits;
8216 if (vec_index == first_vec_index
8217 || first_vec_index == -1)
8219 first_vec_index = vec_index;
8221 else if (vec_index == second_vec_index
8222 || second_vec_index == -1)
8224 second_vec_index = vec_index;
8225 mask_element += const_nunits;
8227 else
8229 if (dump_p)
8230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8231 "permutation requires at "
8232 "least three vectors %G",
8233 stmt_info->stmt);
8234 gcc_assert (analyze_only);
8235 return false;
8238 gcc_assert (mask_element < 2 * const_nunits);
8241 if (mask_element != index)
8242 noop_p = false;
8243 mask[index++] = mask_element;
8245 if (index == count)
8247 if (!noop_p)
8249 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8250 if (!can_vec_perm_const_p (mode, mode, indices))
8252 if (dump_p)
8254 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8255 "unsupported vect permute { ");
8256 for (i = 0; i < count; ++i)
8258 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8259 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8261 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8263 gcc_assert (analyze_only);
8264 return false;
8267 tree mask_vec = NULL_TREE;
8268 if (!analyze_only)
8269 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8271 if (second_vec_index == -1)
8272 second_vec_index = first_vec_index;
8274 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8276 ++*n_perms;
8277 if (analyze_only)
8278 continue;
8279 /* Generate the permute statement if necessary. */
8280 tree first_vec = dr_chain[first_vec_index + ri];
8281 tree second_vec = dr_chain[second_vec_index + ri];
8282 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8283 tree perm_dest
8284 = vect_create_destination_var (gimple_assign_lhs (stmt),
8285 vectype);
8286 perm_dest = make_ssa_name (perm_dest);
8287 gimple *perm_stmt
8288 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8289 second_vec, mask_vec);
8290 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8291 gsi);
8292 if (dce_chain)
8294 bitmap_set_bit (used_defs, first_vec_index + ri);
8295 bitmap_set_bit (used_defs, second_vec_index + ri);
8298 /* Store the vector statement in NODE. */
8299 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
8302 else if (!analyze_only)
8304 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8306 tree first_vec = dr_chain[first_vec_index + ri];
8307 /* If mask was NULL_TREE generate the requested
8308 identity transform. */
8309 gimple *perm_stmt = SSA_NAME_DEF_STMT (first_vec);
8310 if (dce_chain)
8311 bitmap_set_bit (used_defs, first_vec_index + ri);
8313 /* Store the vector statement in NODE. */
8314 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
8318 index = 0;
8319 first_vec_index = -1;
8320 second_vec_index = -1;
8321 noop_p = true;
8325 if (n_loads)
8327 if (repeating_p)
8328 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8329 else
8331 /* Enforced above when !repeating_p. */
8332 unsigned int const_nunits = nunits.to_constant ();
8333 *n_loads = 0;
8334 bool load_seen = false;
8335 for (unsigned i = 0; i < in_nlanes; ++i)
8337 if (i % const_nunits == 0)
8339 if (load_seen)
8340 *n_loads += 1;
8341 load_seen = false;
8343 if (bitmap_bit_p (used_in_lanes, i))
8344 load_seen = true;
8346 if (load_seen)
8347 *n_loads += 1;
8351 if (dce_chain)
8352 for (unsigned i = 0; i < dr_chain.length (); ++i)
8353 if (!bitmap_bit_p (used_defs, i))
8355 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
8356 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8357 gsi_remove (&rgsi, true);
8358 release_defs (stmt);
8361 return true;
8364 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8365 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8366 permute statements for the SLP node NODE. Store the number of vector
8367 permute instructions in *N_PERMS and the number of vector load
8368 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8369 that were not needed. */
8371 bool
8372 vect_transform_slp_perm_load (vec_info *vinfo,
8373 slp_tree node, const vec<tree> &dr_chain,
8374 gimple_stmt_iterator *gsi, poly_uint64 vf,
8375 bool analyze_only, unsigned *n_perms,
8376 unsigned int *n_loads, bool dce_chain)
8378 return vect_transform_slp_perm_load_1 (vinfo, node,
8379 SLP_TREE_LOAD_PERMUTATION (node),
8380 dr_chain, gsi, vf, analyze_only,
8381 dump_enabled_p (), n_perms, n_loads,
8382 dce_chain);
8385 /* Produce the next vector result for SLP permutation NODE by adding a vector
8386 statement at GSI. If MASK_VEC is nonnull, add:
8388 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8390 otherwise add:
8392 <new SSA name> = FIRST_DEF. */
8394 static void
8395 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8396 slp_tree node, tree first_def, tree second_def,
8397 tree mask_vec)
8399 tree vectype = SLP_TREE_VECTYPE (node);
8401 /* ??? We SLP match existing vector element extracts but
8402 allow punning which we need to re-instantiate at uses
8403 but have no good way of explicitly representing. */
8404 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8405 && !types_compatible_p (TREE_TYPE (first_def), vectype))
8407 gassign *conv_stmt
8408 = gimple_build_assign (make_ssa_name (vectype),
8409 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8410 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8411 first_def = gimple_assign_lhs (conv_stmt);
8413 gassign *perm_stmt;
8414 tree perm_dest = make_ssa_name (vectype);
8415 if (mask_vec)
8417 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8418 TYPE_SIZE (vectype))
8419 && !types_compatible_p (TREE_TYPE (second_def), vectype))
8421 gassign *conv_stmt
8422 = gimple_build_assign (make_ssa_name (vectype),
8423 build1 (VIEW_CONVERT_EXPR,
8424 vectype, second_def));
8425 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8426 second_def = gimple_assign_lhs (conv_stmt);
8428 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8429 first_def, second_def,
8430 mask_vec);
8432 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8434 /* For identity permutes we still need to handle the case
8435 of lowpart extracts or concats. */
8436 unsigned HOST_WIDE_INT c;
8437 auto first_def_nunits
8438 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8439 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8441 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8442 TYPE_SIZE (vectype), bitsize_zero_node);
8443 perm_stmt = gimple_build_assign (perm_dest, lowpart);
8445 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8446 first_def_nunits, &c) && c == 2)
8448 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8449 NULL_TREE, second_def);
8450 perm_stmt = gimple_build_assign (perm_dest, ctor);
8452 else
8453 gcc_unreachable ();
8455 else
8457 /* We need a copy here in case the def was external. */
8458 perm_stmt = gimple_build_assign (perm_dest, first_def);
8460 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8461 /* Store the vector statement in NODE. */
8462 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
8465 /* Subroutine of vectorizable_slp_permutation. Check whether the target
8466 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8467 If GSI is nonnull, emit the permutation there.
8469 When GSI is null, the only purpose of NODE is to give properties
8470 of the result, such as the vector type and number of SLP lanes.
8471 The node does not need to be a VEC_PERM_EXPR.
8473 If the target supports the operation, return the number of individual
8474 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8475 dump file if DUMP_P is true. */
8477 static int
8478 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8479 slp_tree node, lane_permutation_t &perm,
8480 vec<slp_tree> &children, bool dump_p)
8482 tree vectype = SLP_TREE_VECTYPE (node);
8484 /* ??? We currently only support all same vector input types
8485 while the SLP IL should really do a concat + select and thus accept
8486 arbitrary mismatches. */
8487 slp_tree child;
8488 unsigned i;
8489 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8490 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8491 tree op_vectype = NULL_TREE;
8492 FOR_EACH_VEC_ELT (children, i, child)
8493 if (SLP_TREE_VECTYPE (child))
8495 op_vectype = SLP_TREE_VECTYPE (child);
8496 break;
8498 if (!op_vectype)
8499 op_vectype = vectype;
8500 FOR_EACH_VEC_ELT (children, i, child)
8502 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8503 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8504 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8505 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8507 if (dump_p)
8508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8509 "Unsupported vector types in lane permutation\n");
8510 return -1;
8512 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8513 repeating_p = false;
8516 gcc_assert (perm.length () == SLP_TREE_LANES (node));
8517 if (dump_p)
8519 dump_printf_loc (MSG_NOTE, vect_location,
8520 "vectorizing permutation");
8521 for (unsigned i = 0; i < perm.length (); ++i)
8522 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8523 if (repeating_p)
8524 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8525 dump_printf (MSG_NOTE, "\n");
8528 /* REPEATING_P is true if every output vector is guaranteed to use the
8529 same permute vector. We can handle that case for both variable-length
8530 and constant-length vectors, but we only handle other cases for
8531 constant-length vectors.
8533 Set:
8535 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8536 mask vector that we want to build.
8538 - NCOPIES to the number of copies of PERM that we need in order
8539 to build the necessary permute mask vectors.
8541 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8542 for each permute mask vector. This is only relevant when GSI is
8543 nonnull. */
8544 uint64_t npatterns;
8545 unsigned nelts_per_pattern;
8546 uint64_t ncopies;
8547 unsigned noutputs_per_mask;
8548 if (repeating_p)
8550 /* We need a single permute mask vector that has the form:
8552 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8554 In other words, the original n-element permute in PERM is
8555 "unrolled" to fill a full vector. The stepped vector encoding
8556 that we use for permutes requires 3n elements. */
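/* E.g. a two-lane permute { X1, X2 } uses NPATTERNS == 2 and
   NELTS_PER_PATTERN == 3, encoding the elements
   { X1, X2, X1 + 2, X2 + 2, X1 + 4, X2 + 4 } from which the rest of
   the mask follows with a step of 2 in each lane position. */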
8557 npatterns = SLP_TREE_LANES (node);
8558 nelts_per_pattern = ncopies = 3;
8559 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8561 else
8563 /* Calculate every element of every permute mask vector explicitly,
8564 instead of relying on the pattern described above. */
8565 if (!nunits.is_constant (&npatterns))
8566 return -1;
8567 nelts_per_pattern = ncopies = 1;
8568 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
8569 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
8570 return -1;
8571 noutputs_per_mask = 1;
8573 unsigned olanes = ncopies * SLP_TREE_LANES (node);
8574 gcc_assert (repeating_p || multiple_p (olanes, nunits));
8576 /* Compute the { { SLP operand, vector index }, lane } permutation sequence
8577 from the { SLP operand, scalar lane } permutation as recorded in the
8578 SLP node as an intermediate step. This part should already work
8579 with SLP children with an arbitrary number of lanes. */
8580 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
8581 auto_vec<unsigned> active_lane;
8582 vperm.create (olanes);
8583 active_lane.safe_grow_cleared (children.length (), true);
8584 for (unsigned i = 0; i < ncopies; ++i)
8586 for (unsigned pi = 0; pi < perm.length (); ++pi)
8588 std::pair<unsigned, unsigned> p = perm[pi];
8589 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
8590 if (repeating_p)
8591 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
8592 else
8594 /* We checked above that the vectors are constant-length. */
8595 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
8596 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
8597 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
8598 vperm.quick_push ({{p.first, vi}, vl});
8601 /* Advance to the next group. */
8602 for (unsigned j = 0; j < children.length (); ++j)
8603 active_lane[j] += SLP_TREE_LANES (children[j]);
8606 if (dump_p)
8608 dump_printf_loc (MSG_NOTE, vect_location,
8609 "vectorizing permutation");
8610 for (unsigned i = 0; i < perm.length (); ++i)
8611 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8612 if (repeating_p)
8613 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8614 dump_printf (MSG_NOTE, "\n");
8615 dump_printf_loc (MSG_NOTE, vect_location, "as");
8616 for (unsigned i = 0; i < vperm.length (); ++i)
8618 if (i != 0
8619 && (repeating_p
8620 ? multiple_p (i, npatterns)
8621 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
8622 dump_printf (MSG_NOTE, ",");
8623 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
8624 vperm[i].first.first, vperm[i].first.second,
8625 vperm[i].second);
8627 dump_printf (MSG_NOTE, "\n");
8630 /* We can only handle two-vector permutes; everything else should
8631 be lowered on the SLP level. The following is closely inspired
8632 by vect_transform_slp_perm_load and is supposed to eventually
8633 replace it.
8634 ??? As intermediate step do code-gen in the SLP tree representation
8635 somehow? */
8636 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
8637 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
8638 unsigned int index = 0;
8639 poly_uint64 mask_element;
8640 vec_perm_builder mask;
8641 mask.new_vector (nunits, npatterns, nelts_per_pattern);
8642 unsigned int count = mask.encoded_nelts ();
8643 mask.quick_grow (count);
8644 vec_perm_indices indices;
8645 unsigned nperms = 0;
8646 for (unsigned i = 0; i < vperm.length (); ++i)
8648 mask_element = vperm[i].second;
8649 if (first_vec.first == -1U
8650 || first_vec == vperm[i].first)
8651 first_vec = vperm[i].first;
8652 else if (second_vec.first == -1U
8653 || second_vec == vperm[i].first)
8655 second_vec = vperm[i].first;
8656 mask_element += nunits;
8658 else
8660 if (dump_p)
8661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8662 "permutation requires at "
8663 "least three vectors\n");
8664 gcc_assert (!gsi);
8665 return -1;
8668 mask[index++] = mask_element;
8670 if (index == count)
8672 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
8673 TYPE_VECTOR_SUBPARTS (op_vectype));
8674 bool identity_p = indices.series_p (0, 1, 0, 1);
8675 machine_mode vmode = TYPE_MODE (vectype);
8676 machine_mode op_vmode = TYPE_MODE (op_vectype);
8677 unsigned HOST_WIDE_INT c;
8678 if ((!identity_p
8679 && !can_vec_perm_const_p (vmode, op_vmode, indices))
8680 || (identity_p
8681 && !known_le (nunits,
8682 TYPE_VECTOR_SUBPARTS (op_vectype))
8683 && (!constant_multiple_p (nunits,
8684 TYPE_VECTOR_SUBPARTS (op_vectype),
8685 &c) || c != 2)))
8687 if (dump_p)
8689 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8690 vect_location,
8691 "unsupported vect permute { ");
8692 for (i = 0; i < count; ++i)
8694 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8695 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8697 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8699 gcc_assert (!gsi);
8700 return -1;
8703 if (!identity_p)
8704 nperms++;
8705 if (gsi)
8707 if (second_vec.first == -1U)
8708 second_vec = first_vec;
8710 slp_tree
8711 first_node = children[first_vec.first],
8712 second_node = children[second_vec.first];
8714 tree mask_vec = NULL_TREE;
8715 if (!identity_p)
8716 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8718 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
8720 tree first_def
8721 = vect_get_slp_vect_def (first_node,
8722 first_vec.second + vi);
8723 tree second_def
8724 = vect_get_slp_vect_def (second_node,
8725 second_vec.second + vi);
8726 vect_add_slp_permutation (vinfo, gsi, node, first_def,
8727 second_def, mask_vec);
8731 index = 0;
8732 first_vec = std::make_pair (-1U, -1U);
8733 second_vec = std::make_pair (-1U, -1U);
8737 return nperms;
8740 /* Vectorize the SLP permutations in NODE as specified
8741 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
8742 child number and lane number.
8743 Interleaving of two two-lane two-child SLP subtrees (not supported):
8744 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
8745 A blend of two four-lane two-child SLP subtrees:
8746 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
8747 Highpart of a four-lane one-child SLP subtree (not supported):
8748 [ { 0, 2 }, { 0, 3 } ]
8749 Of these, currently only a subset is supported by the code generation below. */
8751 static bool
8752 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8753 slp_tree node, stmt_vector_for_cost *cost_vec)
8755 tree vectype = SLP_TREE_VECTYPE (node);
8756 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
8757 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
8758 SLP_TREE_CHILDREN (node),
8759 dump_enabled_p ());
8760 if (nperms < 0)
8761 return false;
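/* Without an insertion iterator we are only analyzing; just record the
   cost of the permutes that will be generated. */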
8763 if (!gsi)
8764 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
8766 return true;
8769 /* Vectorize SLP NODE. */
8771 static void
8772 vect_schedule_slp_node (vec_info *vinfo,
8773 slp_tree node, slp_instance instance)
8775 gimple_stmt_iterator si;
8776 int i;
8777 slp_tree child;
8779 /* For existing vectors there's nothing to do. */
8780 if (SLP_TREE_VEC_DEFS (node).exists ())
8781 return;
8783 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
8785 /* Vectorize externals and constants. */
8786 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
8787 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
8789 /* ??? vectorizable_shift can end up using a scalar operand which is
8790 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
8791 node in this case. */
8792 if (!SLP_TREE_VECTYPE (node))
8793 return;
8795 vect_create_constant_vectors (vinfo, node);
8796 return;
8799 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
8801 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
8802 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
8804 if (dump_enabled_p ())
8805 dump_printf_loc (MSG_NOTE, vect_location,
8806 "------>vectorizing SLP node starting from: %G",
8807 stmt_info->stmt);
8809 if (STMT_VINFO_DATA_REF (stmt_info)
8810 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8812 /* Vectorized loads go before the first scalar load to make it
8813 ready early, vectorized stores go before the last scalar
8814 stmt which is where all uses are ready. */
8815 stmt_vec_info last_stmt_info = NULL;
8816 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
8817 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
8818 else /* DR_IS_WRITE */
8819 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
8820 si = gsi_for_stmt (last_stmt_info->stmt);
8822 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
8823 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
8824 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
8825 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8827 /* For PHI node vectorization we do not use the insertion iterator. */
8828 si = gsi_none ();
8830 else
8832 /* Emit other stmts right after the children's vectorized defs, which is
8833 the earliest possible insertion point. */
8834 gimple *last_stmt = NULL;
8835 bool seen_vector_def = false;
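/* Find the statement that comes last among the defs of all children;
   that is the earliest point at which all operands of NODE are
   available. */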
8836 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8837 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8839 /* For fold-left reductions we are retaining the scalar
8840 reduction PHI but we still have SLP_TREE_NUMBER_OF_VEC_STMTS
8841 set, so the representation isn't perfect. Resort to the
8842 last scalar def here. */
8843 if (SLP_TREE_VEC_STMTS (child).is_empty ())
8845 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
8846 == cycle_phi_info_type);
8847 gphi *phi = as_a <gphi *>
8848 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
8849 if (!last_stmt
8850 || vect_stmt_dominates_stmt_p (last_stmt, phi))
8851 last_stmt = phi;
8853 /* We are emitting all vectorized stmts in the same place, so the
8854 stmt emitted last is the last in the IL.
8855 ??? Unless we have a load permutation applied and that happens
8856 to re-use an earlier generated load. */
8857 unsigned j;
8858 gimple *vstmt;
8859 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
8860 if (!last_stmt
8861 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
8862 last_stmt = vstmt;
8864 else if (!SLP_TREE_VECTYPE (child))
8866 /* For externals used unvectorized (no vectype) look at all the scalar defs. */
8867 unsigned j;
8868 tree def;
8869 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
8870 if (TREE_CODE (def) == SSA_NAME
8871 && !SSA_NAME_IS_DEFAULT_DEF (def))
8873 gimple *stmt = SSA_NAME_DEF_STMT (def);
8874 if (!last_stmt
8875 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
8876 last_stmt = stmt;
8879 else
8881 /* For externals we have to look at all defs since their
8882 insertion place is decided per vector. But beware
8883 of pre-existing vectors where we need to make sure
8884 we do not insert before the region boundary. */
8885 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
8886 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
8887 seen_vector_def = true;
8888 else
8890 unsigned j;
8891 tree vdef;
8892 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
8893 if (TREE_CODE (vdef) == SSA_NAME
8894 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
8896 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
8897 if (!last_stmt
8898 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
8899 last_stmt = vstmt;
8903 /* This can happen when all children are pre-existing vectors or
8904 constants. */
8905 if (!last_stmt)
8906 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
8907 if (!last_stmt)
8909 gcc_assert (seen_vector_def);
8910 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
8912 else if (is_ctrl_altering_stmt (last_stmt))
8914 /* We split regions to vectorize at control altering stmts
8915 with a definition so this must be an external which
8916 we can insert at the start of the region. */
8917 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
8919 else if (is_a <bb_vec_info> (vinfo)
8920 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
8921 && gimple_could_trap_p (stmt_info->stmt))
8923 /* We've constrained possibly trapping operations to all come
8924 from the same basic-block; if vectorized defs would allow earlier
8925 scheduling, still force the vectorized stmts into the original block.
8926 This is only necessary for BB vectorization since for loop vect
8927 all operations are in a single BB and scalar stmt based
8928 placement doesn't play well with epilogue vectorization. */
8929 gcc_assert (dominated_by_p (CDI_DOMINATORS,
8930 gimple_bb (stmt_info->stmt),
8931 gimple_bb (last_stmt)));
8932 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
8934 else if (is_a <gphi *> (last_stmt))
8935 si = gsi_after_labels (gimple_bb (last_stmt));
8936 else
8938 si = gsi_for_stmt (last_stmt);
8939 gsi_next (&si);
8943 /* Handle purely internal nodes. */
8944 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
8946 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
8947 be shared with different SLP nodes (but usually it's the same
8948 operation, apart from the case where the stmt is only there to denote
8949 the actual scalar lane defs ...). So do not call vect_transform_stmt
8950 but open-code it here (partly). */
8951 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
8952 gcc_assert (done);
8953 stmt_vec_info slp_stmt_info;
8954 unsigned int i;
8955 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
8956 if (STMT_VINFO_LIVE_P (slp_stmt_info))
8958 done = vectorizable_live_operation (vinfo,
8959 slp_stmt_info, &si, node,
8960 instance, i, true, NULL);
8961 gcc_assert (done);
8964 else
8965 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
8968 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
8969 For loop vectorization this is done in vectorizable_call, but for SLP
8970 it needs to be deferred until the end of vect_schedule_slp, because multiple
8971 SLP instances may refer to the same scalar stmt. */
8973 static void
8974 vect_remove_slp_scalar_calls (vec_info *vinfo,
8975 slp_tree node, hash_set<slp_tree> &visited)
8977 gimple *new_stmt;
8978 gimple_stmt_iterator gsi;
8979 int i;
8980 slp_tree child;
8981 tree lhs;
8982 stmt_vec_info stmt_info;
8984 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
8985 return;
8987 if (visited.add (node))
8988 return;
8990 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8991 vect_remove_slp_scalar_calls (vinfo, child, visited);
8993 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8995 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
8996 if (!stmt || gimple_bb (stmt) == NULL)
8997 continue;
8998 if (is_pattern_stmt_p (stmt_info)
8999 || !PURE_SLP_STMT (stmt_info))
9000 continue;
9001 lhs = gimple_call_lhs (stmt);
9002 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9003 gsi = gsi_for_stmt (stmt);
9004 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9005 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
9009 static void
9010 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9012 hash_set<slp_tree> visited;
9013 vect_remove_slp_scalar_calls (vinfo, node, visited);
9016 /* Vectorize the instance root. */
9018 void
9019 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9021 gassign *rstmt = NULL;
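/* For a constructor root replace the scalar CONSTRUCTOR either directly
   with the single vectorized def or with a CONSTRUCTOR combining the
   multiple vector defs. */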
9023 if (instance->kind == slp_inst_kind_ctor)
9025 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9027 gimple *child_stmt = SLP_TREE_VEC_STMTS (node)[0];
9028 tree vect_lhs = gimple_get_lhs (child_stmt);
9029 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9030 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9031 TREE_TYPE (vect_lhs)))
9032 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9033 vect_lhs);
9034 rstmt = gimple_build_assign (root_lhs, vect_lhs);
9036 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9038 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9039 gimple *child_stmt;
9040 int j;
9041 vec<constructor_elt, va_gc> *v;
9042 vec_alloc (v, nelts);
9044 /* A CTOR can handle V16HI composition from VNx8HI so we
9045 do not need to convert vector elements if the types
9046 do not match. */
9047 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
9048 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
9049 gimple_get_lhs (child_stmt));
9050 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9051 tree rtype
9052 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9053 tree r_constructor = build_constructor (rtype, v);
9054 rstmt = gimple_build_assign (lhs, r_constructor);
9057 else if (instance->kind == slp_inst_kind_bb_reduc)
9059 /* Largely inspired by reduction chain epilogue handling in
9060 vect_create_epilog_for_reduction. */
9061 vec<tree> vec_defs = vNULL;
9062 vect_get_slp_defs (node, &vec_defs);
9063 enum tree_code reduc_code
9064 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9065 /* ??? We actually have to reflect signs somewhere. */
9066 if (reduc_code == MINUS_EXPR)
9067 reduc_code = PLUS_EXPR;
9068 gimple_seq epilogue = NULL;
9069 /* We may end up with more than one vector result; reduce them
9070 to a single vector. */
9071 tree vec_def = vec_defs[0];
9072 for (unsigned i = 1; i < vec_defs.length (); ++i)
9073 vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
9074 vec_def, vec_defs[i]);
9075 vec_defs.release ();
9076 /* ??? Support schemes other than a direct internal fn. */
9077 internal_fn reduc_fn;
9078 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9079 || reduc_fn == IFN_LAST)
9080 gcc_unreachable ();
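/* Reduce the remaining vector to a scalar with the reduction internal
   function, insert the epilogue before the root stmt and rewrite the
   root to use the scalar result. */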
9081 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9082 TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
9084 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9085 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9086 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9087 update_stmt (gsi_stmt (rgsi));
9088 return;
9090 else
9091 gcc_unreachable ();
9093 gcc_assert (rstmt);
9095 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9096 gsi_replace (&rgsi, rstmt, true);
9099 struct slp_scc_info
9101 bool on_stack;
9102 int dfs;
9103 int lowlink;
9106 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
9108 static void
9109 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9110 hash_map<slp_tree, slp_scc_info> &scc_info,
9111 int &maxdfs, vec<slp_tree> &stack)
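/* This is essentially Tarjan's SCC algorithm: nodes get DFS and lowlink
   numbers, stay on STACK while their SCC is open, and a complete SCC is
   popped and scheduled once its root (lowlink == dfs) is reached. */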
9113 bool existed_p;
9114 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9115 gcc_assert (!existed_p);
9116 info->dfs = maxdfs;
9117 info->lowlink = maxdfs;
9118 maxdfs++;
9120 /* Leaf. */
9121 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9123 info->on_stack = false;
9124 vect_schedule_slp_node (vinfo, node, instance);
9125 return;
9128 info->on_stack = true;
9129 stack.safe_push (node);
9131 unsigned i;
9132 slp_tree child;
9133 /* DFS recurse. */
9134 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9136 if (!child)
9137 continue;
9138 slp_scc_info *child_info = scc_info.get (child);
9139 if (!child_info)
9141 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9142 /* Recursion might have grown the scc_info hash map and invalidated the pointers; re-fetch them. */
9143 info = scc_info.get (node);
9144 child_info = scc_info.get (child);
9145 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9147 else if (child_info->on_stack)
9148 info->lowlink = MIN (info->lowlink, child_info->dfs);
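/* If this node is not an SCC root leave it on the stack; the root
   higher up will pop and schedule the whole SCC. */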
9150 if (info->lowlink != info->dfs)
9151 return;
9153 auto_vec<slp_tree, 4> phis_to_fixup;
9155 /* Singleton. */
9156 if (stack.last () == node)
9158 stack.pop ();
9159 info->on_stack = false;
9160 vect_schedule_slp_node (vinfo, node, instance);
9161 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9162 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9163 phis_to_fixup.quick_push (node);
9165 else
9167 /* SCC. */
9168 int last_idx = stack.length () - 1;
9169 while (stack[last_idx] != node)
9170 last_idx--;
9171 /* We can break the cycle at PHIs that have at least one child
9172 code generated. Then we could re-start the DFS walk until
9173 all nodes in the SCC are covered (we might have new entries
9174 for only back-reachable nodes). But it's simpler to just
9175 iterate and schedule those that are ready. */
9176 unsigned todo = stack.length () - last_idx;
do
9179 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9181 slp_tree entry = stack[idx];
9182 if (!entry)
9183 continue;
9184 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9185 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9186 bool ready = !phi;
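/* A non-PHI is ready only when none of its children are still on the
   stack; a PHI is ready as soon as one child is already scheduled (or
   is a missing backedge arg), its backedge defs are fixed up afterwards. */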
9187 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9188 if (!child)
9190 gcc_assert (phi);
9191 ready = true;
9192 break;
9194 else if (scc_info.get (child)->on_stack)
9196 if (!phi)
9198 ready = false;
9199 break;
9202 else
9204 if (phi)
9206 ready = true;
9207 break;
9210 if (ready)
9212 vect_schedule_slp_node (vinfo, entry, instance);
9213 scc_info.get (entry)->on_stack = false;
9214 stack[idx] = NULL;
9215 todo--;
9216 if (phi)
9217 phis_to_fixup.safe_push (entry);
9221 while (todo != 0);
9223 /* Pop the SCC. */
9224 stack.truncate (last_idx);
9227 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
9228 slp_tree phi_node;
9229 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9231 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9232 edge_iterator ei;
9233 edge e;
9234 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9236 unsigned dest_idx = e->dest_idx;
9237 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9238 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9239 continue;
9240 unsigned n = SLP_TREE_VEC_STMTS (phi_node).length ();
9241 /* Simply fill all args. */
9242 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9243 != vect_first_order_recurrence)
9244 for (unsigned i = 0; i < n; ++i)
9245 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
9246 vect_get_slp_vect_def (child, i),
9247 e, gimple_phi_arg_location (phi, dest_idx));
9248 else
9250 /* Unless it is a first order recurrence which needs
9251 args filled in for both the PHI node and the permutes. */
9252 gimple *perm = SLP_TREE_VEC_STMTS (phi_node)[0];
9253 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9254 add_phi_arg (as_a <gphi *> (rphi),
9255 vect_get_slp_vect_def (child, n - 1),
9256 e, gimple_phi_arg_location (phi, dest_idx));
9257 for (unsigned i = 0; i < n; ++i)
9259 gimple *perm = SLP_TREE_VEC_STMTS (phi_node)[i];
9260 if (i > 0)
9261 gimple_assign_set_rhs1 (perm,
9262 vect_get_slp_vect_def (child, i - 1));
9263 gimple_assign_set_rhs2 (perm,
9264 vect_get_slp_vect_def (child, i));
9265 update_stmt (perm);
9272 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9274 void
9275 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9277 slp_instance instance;
9278 unsigned int i;
9280 hash_map<slp_tree, slp_scc_info> scc_info;
9281 int maxdfs = 0;
9282 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9284 slp_tree node = SLP_INSTANCE_TREE (instance);
9285 if (dump_enabled_p ())
9287 dump_printf_loc (MSG_NOTE, vect_location,
9288 "Vectorizing SLP tree:\n");
9289 /* ??? Dump all? */
9290 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9291 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9292 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9293 vect_print_slp_graph (MSG_NOTE, vect_location,
9294 SLP_INSTANCE_TREE (instance));
9296 /* Schedule the tree of INSTANCE, scheduling SCCs so that a PHI
9297 is the node breaking the cycle. */
9298 auto_vec<slp_tree> stack;
9299 if (!scc_info.get (node))
9300 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9302 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9303 vectorize_slp_instance_root_stmt (node, instance);
9305 if (dump_enabled_p ())
9306 dump_printf_loc (MSG_NOTE, vect_location,
9307 "vectorizing stmts using SLP.\n");
9310 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9312 slp_tree root = SLP_INSTANCE_TREE (instance);
9313 stmt_vec_info store_info;
9314 unsigned int j;
9316 /* Remove scalar call stmts. Do not do this for basic-block
9317 vectorization as not all uses may be vectorized.
9318 ??? Why should this be necessary? DCE should be able to
9319 remove the stmts itself.
9320 ??? For BB vectorization we can as well remove scalar
9321 stmts starting from the SLP tree root if they have no
9322 uses. */
9323 if (is_a <loop_vec_info> (vinfo))
9324 vect_remove_slp_scalar_calls (vinfo, root);
9326 /* Remove the vectorized stores' original scalar stmts. */
9327 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9329 if (!STMT_VINFO_DATA_REF (store_info)
9330 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9331 break;
9333 store_info = vect_orig_stmt (store_info);
9334 /* Free the attached stmt_vec_info and remove the stmt. */
9335 vinfo->remove_stmt (store_info);
9337 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
9338 so that vect_free_slp_tree does not crash later. */
9339 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9340 SLP_TREE_REPRESENTATIVE (root) = NULL;