gcc/tree-vect-data-refs.cc
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
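/* An illustrative example (not tied to any particular target): a grouped
   access such as

     struct rgb { float r, g, b; } *p;
     for (int i = 0; i < n; ++i)
       p[i].r = p[i].g + p[i].b;

   can use load/store-lanes with COUNT == 3 vectors of VECTYPE V4SF,
   e.g. mapping to AArch64 LD3/ST3, assuming the target implements the
   corresponding vec_load_lanes / vec_store_lanes optabs.  */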
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
100 /* Helper function to identify a simd clone call. If this is a call to a
101 function with simd clones then return the corresponding cgraph_node,
102 otherwise return NULL. */
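/* For example (illustrative code), given

     #pragma omp declare simd
     float f (float x);

     for (int i = 0; i < n; ++i)
       a[i] = f (b[i]);

   the call to f has simd clones attached to its cgraph_node, so this
   helper returns that node; calls without clones yield NULL.  */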
104 static cgraph_node*
105 simd_clone_call_p (gimple *stmt)
107 gcall *call = dyn_cast <gcall *> (stmt);
108 if (!call)
109 return NULL;
111 tree fndecl = NULL_TREE;
112 if (gimple_call_internal_p (call, IFN_MASK_CALL))
113 fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
114 else
115 fndecl = gimple_call_fndecl (stmt);
117 if (fndecl == NULL_TREE)
118 return NULL;
120 cgraph_node *node = cgraph_node::get (fndecl);
121 if (node && node->simd_clones != NULL)
122 return node;
124 return NULL;
129 /* Return the smallest scalar part of STMT_INFO.
130 This is used to determine the vectype of the stmt. We generally set the
131 vectype according to the type of the result (lhs). For stmts whose
132 result-type is different than the type of the arguments (e.g., demotion,
133 promotion), vectype will be reset appropriately (later). Note that we have
134 to visit the smallest datatype in this function, because that determines the
135 VF. If the smallest datatype in the loop is present only as the rhs of a
136 promotion operation - we'd miss it.
137 Such a case, where a variable of this datatype does not appear in the lhs
138 anywhere in the loop, can only occur if it's an invariant: e.g.:
139 'int_x = (int) short_inv', which we'd expect to have been optimized away by
140 invariant motion. However, we cannot rely on invariant motion to always
141 take invariants out of the loop, and so in the case of promotion we also
142 have to check the rhs.
143 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
144 types. */
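/* For instance (an illustrative sketch), in

     short *s; int *d;
     for (int i = 0; i < n; ++i)
       d[i] = (int) s[i] * 2;

   the statement has an int lhs but reads a 2-byte short, and it is the
   2-byte type that must drive the vectorization factor, which is why the
   rhs is inspected below.  */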
146 tree
147 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
149 HOST_WIDE_INT lhs, rhs;
151 /* During the analysis phase, this function is called on arbitrary
152 statements that might not have scalar results. */
153 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
154 return scalar_type;
156 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
158 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
159 if (assign)
161 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
162 if (gimple_assign_cast_p (assign)
163 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
164 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
165 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
166 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
167 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
169 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
171 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
172 if (rhs < lhs)
173 scalar_type = rhs_type;
176 else if (cgraph_node *node = simd_clone_call_p (stmt_info->stmt))
178 auto clone = node->simd_clones->simdclone;
179 for (unsigned int i = 0; i < clone->nargs; ++i)
181 if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
183 tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type);
184 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type));
185 if (rhs < lhs)
187 scalar_type = arg_scalar_type;
188 lhs = rhs;
193 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
195 unsigned int i = 0;
196 if (gimple_call_internal_p (call))
198 internal_fn ifn = gimple_call_internal_fn (call);
199 if (internal_load_fn_p (ifn))
200 /* For loads the LHS type does the trick. */
201 i = ~0U;
202 else if (internal_store_fn_p (ifn))
204 /* For stores use the type of the stored value. */
205 i = internal_fn_stored_value_index (ifn);
206 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
207 i = ~0U;
209 else if (internal_fn_mask_index (ifn) == 0)
210 i = 1;
212 if (i < gimple_call_num_args (call))
214 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
215 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
217 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
218 if (rhs < lhs)
219 scalar_type = rhs_type;
224 return scalar_type;
228 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
229 tested at run-time. Return TRUE if DDR was successfully inserted.
230 Return false if versioning is not supported. */
232 static opt_result
233 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
235 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
237 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
238 return opt_result::failure_at (vect_location,
239 "will not create alias checks, as"
240 " --param vect-max-version-for-alias-checks"
241 " == 0\n");
243 opt_result res
244 = runtime_alias_check_p (ddr, loop,
245 optimize_loop_nest_for_speed_p (loop));
246 if (!res)
247 return res;
249 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
250 return opt_result::success ();
253 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
255 static void
256 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
258 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
259 for (unsigned int i = 0; i < checks.length(); ++i)
260 if (checks[i] == value)
261 return;
263 if (dump_enabled_p ())
264 dump_printf_loc (MSG_NOTE, vect_location,
265 "need run-time check that %T is nonzero\n",
266 value);
267 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
270 /* Return true if we know that the order of vectorized DR_INFO_A and
271 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
272 DR_INFO_B. At least one of the accesses is a write. */
274 static bool
275 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
277 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
278 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
280 /* Single statements are always kept in their original order. */
281 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
282 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
283 return true;
285 /* STMT_A and STMT_B belong to overlapping groups. All loads are
286 emitted at the position of the first scalar load.
287 Stores in a group are emitted at the position of the last scalar store.
288 Compute that position and check whether the resulting order matches
289 the current one. */
290 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
291 if (il_a)
293 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
294 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
295 s = DR_GROUP_NEXT_ELEMENT (s))
296 il_a = get_later_stmt (il_a, s);
297 else /* DR_IS_READ */
298 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
299 s = DR_GROUP_NEXT_ELEMENT (s))
300 if (get_later_stmt (il_a, s) == il_a)
301 il_a = s;
303 else
304 il_a = stmtinfo_a;
305 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
306 if (il_b)
308 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
309 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
310 s = DR_GROUP_NEXT_ELEMENT (s))
311 il_b = get_later_stmt (il_b, s);
312 else /* DR_IS_READ */
313 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
314 s = DR_GROUP_NEXT_ELEMENT (s))
315 if (get_later_stmt (il_b, s) == il_b)
316 il_b = s;
318 else
319 il_b = stmtinfo_b;
320 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
321 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
324 /* A subroutine of vect_analyze_data_ref_dependence. Handle
325 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
326 distances. These distances are conservatively correct but they don't
327 reflect a guaranteed dependence.
329 Return true if this function does all the work necessary to avoid
330 an alias or false if the caller should use the dependence distances
331 to limit the vectorization factor in the usual way. LOOP_DEPTH is
332 the depth of the loop described by LOOP_VINFO and the other arguments
333 are as for vect_analyze_data_ref_dependence. */
335 static bool
336 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
337 loop_vec_info loop_vinfo,
338 int loop_depth, unsigned int *max_vf)
340 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
341 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
343 int dist = dist_v[loop_depth];
344 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
346 /* If the user asserted safelen >= DIST consecutive iterations
347 can be executed concurrently, assume independence.
349 ??? An alternative would be to add the alias check even
350 in this case, and vectorize the fallback loop with the
351 maximum VF set to safelen. However, if the user has
352 explicitly given a length, it's less likely that that
353 would be a win. */
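/* Such an assertion typically comes from OpenMP, e.g. (illustrative):

     #pragma omp simd safelen(8)
     for (int i = 0; i < n; ++i)
       a[i] = a[i - k] + 1;

   which promises that any 8 consecutive iterations may run concurrently,
   so a conservative distance of up to 8 can be ignored at the price of
   capping the vectorization factor at 8 below.  */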
354 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
356 if ((unsigned int) loop->safelen < *max_vf)
357 *max_vf = loop->safelen;
358 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
359 continue;
362 /* For dependence distances of 2 or more, we have the option
363 of limiting VF or checking for an alias at runtime.
364 Prefer to check at runtime if we can, to avoid limiting
365 the VF unnecessarily when the bases are in fact independent.
367 Note that the alias checks will be removed if the VF ends up
368 being small enough. */
369 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
370 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
371 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
372 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
373 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
376 return true;
380 /* Function vect_analyze_data_ref_dependence.
382 FIXME: I needed to change the sense of the returned flag.
384 Return FALSE if there (might) exist a dependence between a memory-reference
385 DRA and a memory-reference DRB. When versioning for alias can check the
386 dependence at run-time, return TRUE. Adjust *MAX_VF according to
387 the data dependence. */
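/* A worked example (illustrative): in

     for (int i = 0; i < n; ++i)
       a[i + 3] = a[i] + 1;

   the read a[i] and the write a[i + 3] have dependence distance 3, so
   vectorizing with a factor greater than 3 would load values before the
   corresponding stores; *MAX_VF is therefore capped at 3 below.  */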
389 static opt_result
390 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
391 loop_vec_info loop_vinfo,
392 unsigned int *max_vf)
394 unsigned int i;
395 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
396 struct data_reference *dra = DDR_A (ddr);
397 struct data_reference *drb = DDR_B (ddr);
398 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
399 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
400 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
401 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
402 lambda_vector dist_v;
403 unsigned int loop_depth;
405 /* If user asserted safelen consecutive iterations can be
406 executed concurrently, assume independence. */
407 auto apply_safelen = [&]()
409 if (loop->safelen >= 2)
411 if ((unsigned int) loop->safelen < *max_vf)
412 *max_vf = loop->safelen;
413 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
414 return true;
416 return false;
419 /* In loop analysis all data references should be vectorizable. */
420 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
421 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
422 gcc_unreachable ();
424 /* Independent data accesses. */
425 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
426 return opt_result::success ();
428 if (dra == drb
429 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
430 return opt_result::success ();
432 /* We do not have to consider dependences between accesses that belong
433 to the same group, unless the stride could be smaller than the
434 group size. */
435 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
436 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
437 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
438 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
439 return opt_result::success ();
441 /* Even if we have an anti-dependence then, as the vectorized loop covers at
442 least two scalar iterations, there is always also a true dependence.
443 As the vectorizer does not re-order loads and stores we can ignore
444 the anti-dependence if TBAA can disambiguate both DRs similar to the
445 case with known negative distance anti-dependences (positive
446 distance anti-dependences would violate TBAA constraints). */
447 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
448 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
449 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
450 get_alias_set (DR_REF (drb))))
451 return opt_result::success ();
453 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
454 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
456 if (apply_safelen ())
457 return opt_result::success ();
459 return opt_result::failure_at
460 (stmtinfo_a->stmt,
461 "possible alias involving gather/scatter between %T and %T\n",
462 DR_REF (dra), DR_REF (drb));
465 /* Unknown data dependence. */
466 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
468 if (apply_safelen ())
469 return opt_result::success ();
471 if (dump_enabled_p ())
472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
473 "versioning for alias required: "
474 "can't determine dependence between %T and %T\n",
475 DR_REF (dra), DR_REF (drb));
477 /* Add to list of ddrs that need to be tested at run-time. */
478 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
481 /* Known data dependence. */
482 if (DDR_NUM_DIST_VECTS (ddr) == 0)
484 if (apply_safelen ())
485 return opt_result::success ();
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
489 "versioning for alias required: "
490 "bad dist vector for %T and %T\n",
491 DR_REF (dra), DR_REF (drb));
492 /* Add to list of ddrs that need to be tested at run-time. */
493 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
496 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
498 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
499 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
500 loop_depth, max_vf))
501 return opt_result::success ();
503 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
505 int dist = dist_v[loop_depth];
507 if (dump_enabled_p ())
508 dump_printf_loc (MSG_NOTE, vect_location,
509 "dependence distance = %d.\n", dist);
511 if (dist == 0)
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE, vect_location,
515 "dependence distance == 0 between %T and %T\n",
516 DR_REF (dra), DR_REF (drb));
518 /* When we perform grouped accesses and perform implicit CSE
519 by detecting equal accesses and doing disambiguation with
520 runtime alias tests like for
521 .. = a[i];
522 .. = a[i+1];
523 a[i] = ..;
524 a[i+1] = ..;
525 *p = ..;
526 .. = a[i];
527 .. = a[i+1];
528 where we will end up loading { a[i], a[i+1] } once, make
529 sure that inserting group loads before the first load and
530 stores after the last store will do the right thing.
531 Similar for groups like
532 a[i] = ...;
533 ... = a[i];
534 a[i+1] = ...;
535 where loads from the group interleave with the store. */
536 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
537 return opt_result::failure_at (stmtinfo_a->stmt,
538 "READ_WRITE dependence"
539 " in interleaving.\n");
541 if (loop->safelen < 2)
543 tree indicator = dr_zero_step_indicator (dra);
544 if (!indicator || integer_zerop (indicator))
545 return opt_result::failure_at (stmtinfo_a->stmt,
546 "access also has a zero step\n");
547 else if (TREE_CODE (indicator) != INTEGER_CST)
548 vect_check_nonzero_value (loop_vinfo, indicator);
550 continue;
553 if (dist > 0 && DDR_REVERSED_P (ddr))
555 /* If DDR_REVERSED_P the order of the data-refs in DDR was
556 reversed (to make distance vector positive), and the actual
557 distance is negative. */
558 if (dump_enabled_p ())
559 dump_printf_loc (MSG_NOTE, vect_location,
560 "dependence distance negative.\n");
561 /* When doing outer loop vectorization, we need to check if there is
562 a backward dependence at the inner loop level if the dependence
563 at the outer loop is reversed. See PR81740. */
564 if (nested_in_vect_loop_p (loop, stmtinfo_a)
565 || nested_in_vect_loop_p (loop, stmtinfo_b))
567 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
568 DDR_LOOP_NEST (ddr));
569 if (dist_v[inner_depth] < 0)
570 return opt_result::failure_at (stmtinfo_a->stmt,
571 "not vectorized, dependence "
572 "between data-refs %T and %T\n",
573 DR_REF (dra), DR_REF (drb));
575 /* Record a negative dependence distance to later limit the
576 amount of stmt copying / unrolling we can perform.
577 Only need to handle read-after-write dependence. */
578 if (DR_IS_READ (drb)
579 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
580 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
581 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
582 continue;
585 unsigned int abs_dist = abs (dist);
586 if (abs_dist >= 2 && abs_dist < *max_vf)
588 /* The dependence distance requires reduction of the maximal
589 vectorization factor. */
590 *max_vf = abs_dist;
591 if (dump_enabled_p ())
592 dump_printf_loc (MSG_NOTE, vect_location,
593 "adjusting maximal vectorization factor to %i\n",
594 *max_vf);
597 if (abs_dist >= *max_vf)
599 /* Dependence distance does not create dependence, as far as
600 vectorization is concerned, in this case. */
601 if (dump_enabled_p ())
602 dump_printf_loc (MSG_NOTE, vect_location,
603 "dependence distance >= VF.\n");
604 continue;
607 return opt_result::failure_at (stmtinfo_a->stmt,
608 "not vectorized, possible dependence "
609 "between data-refs %T and %T\n",
610 DR_REF (dra), DR_REF (drb));
613 return opt_result::success ();
616 /* Function vect_analyze_early_break_dependences.
618 Examine all the data references in the loop and make sure that, if we have
619 multiple exits, we are able to safely move stores such that they become
620 safe for vectorization. The function also calculates where to move the
621 instructions to and computes what the new vUSE chain should be.
623 This works in tandem with the CFG that will be produced by
624 slpeel_tree_duplicate_loop_to_edge_cfg later on.
626 This function tries to validate whether an early break vectorization
627 is possible for the current instruction sequence. Returns True if
628 possible, otherwise False.
630 Requirements:
631 - Any memory access must be to a fixed size buffer.
632 - There must not be any loads and stores to the same object.
633 - Multiple loads are allowed as long as they don't alias.
635 NOTE:
636 This implementation is very conservative. Any overlapping loads/stores
637 that take place before the early break statement get rejected aside from
638 WAR dependencies.
640 i.e.:
642 a[i] = 8
643 c = a[i]
644 if (b[i])
647 is not allowed, but
649 c = a[i]
650 a[i] = 8
651 if (b[i])
654 is allowed, which is the common case. */
656 static opt_result
657 vect_analyze_early_break_dependences (loop_vec_info loop_vinfo)
659 DUMP_VECT_SCOPE ("vect_analyze_early_break_dependences");
661 /* List of all load data references found during traversal. */
662 auto_vec<data_reference *> bases;
663 basic_block dest_bb = NULL;
665 hash_set <gimple *> visited;
666 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
667 class loop *loop_nest = loop_outer (loop);
669 if (dump_enabled_p ())
670 dump_printf_loc (MSG_NOTE, vect_location,
671 "loop contains multiple exits, analyzing"
672 " statement dependencies.\n");
674 /* Since we don't support general control flow, the location we'll move the
675 side-effects to is always the latch connected exit. When we support
676 general control flow we can do better but for now this is fine. */
677 dest_bb = single_pred (loop->latch);
678 basic_block bb = dest_bb;
682 /* If the destination block is also the header then we have nothing to do. */
683 if (!single_pred_p (bb))
684 continue;
686 bb = single_pred (bb);
687 gimple_stmt_iterator gsi = gsi_last_bb (bb);
689 /* Now analyze all the remaining statements and try to determine which
690 instructions are allowed/needed to be moved. */
691 while (!gsi_end_p (gsi))
693 gimple *stmt = gsi_stmt (gsi);
694 gsi_prev (&gsi);
695 if (!gimple_has_ops (stmt)
696 || is_gimple_debug (stmt))
697 continue;
699 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt);
700 auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo);
701 if (!dr_ref)
702 continue;
704 /* We currently only support statically allocated objects due to
705 not having first-faulting loads support or peeling for
706 alignment support. Compute the size of the referenced object
707 (it could be dynamically allocated). */
708 tree obj = DR_BASE_ADDRESS (dr_ref);
709 if (!obj || TREE_CODE (obj) != ADDR_EXPR)
711 if (dump_enabled_p ())
712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
713 "early breaks only supported on statically"
714 " allocated objects.\n");
715 return opt_result::failure_at (stmt,
716 "can't safely apply code motion to "
717 "dependencies of %G to vectorize "
718 "the early exit.\n", stmt);
721 tree refop = TREE_OPERAND (obj, 0);
722 tree refbase = get_base_address (refop);
723 if (!refbase || !DECL_P (refbase) || !DECL_SIZE (refbase)
724 || TREE_CODE (DECL_SIZE (refbase)) != INTEGER_CST)
726 if (dump_enabled_p ())
727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
728 "early breaks only supported on"
729 " statically allocated objects.\n");
730 return opt_result::failure_at (stmt,
731 "can't safely apply code motion to "
732 "dependencies of %G to vectorize "
733 "the early exit.\n", stmt);
736 /* Check if vector accesses to the object will be within bounds; the size
737 must be a constant, or we assume the loop will be versioned or niters
738 bounded by VF so accesses are within range. */
739 if (!ref_within_array_bound (stmt, DR_REF (dr_ref)))
741 if (dump_enabled_p ())
742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
743 "early breaks not supported: vectorization "
744 "would %s beyond size of obj.",
745 DR_IS_READ (dr_ref) ? "read" : "write");
746 return opt_result::failure_at (stmt,
747 "can't safely apply code motion to "
748 "dependencies of %G to vectorize "
749 "the early exit.\n", stmt);
752 if (DR_IS_READ (dr_ref))
753 bases.safe_push (dr_ref);
754 else if (DR_IS_WRITE (dr_ref))
756 /* We are moving writes down in the CFG. To be sure that this
757 is valid after vectorization we have to check all the loads
758 we are sinking the stores past to see if any of them may
759 alias or are the same object.
761 Same objects will not be an issue because unless the store
762 is marked volatile the value can be forwarded. If the
763 store is marked volatile we don't vectorize the loop
764 anyway.
766 That leaves the check for aliasing. We don't really need
767 to care about the stores aliasing with each other since the
768 stores are moved in order so the effects are still observed
769 correctly. This leaves the check for WAR dependencies
770 which we would be introducing here if the DR can alias.
771 The check is quadratic in loads/stores but I have not found
772 a better API to do this. I believe all loads and stores
773 must be checked. We also must check them when we
774 encountered the store, since we don't care about loads past
775 the store. */
777 for (auto dr_read : bases)
778 if (dr_may_alias_p (dr_ref, dr_read, loop_nest))
780 if (dump_enabled_p ())
781 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
782 vect_location,
783 "early breaks not supported: "
784 "overlapping loads and stores "
785 "found before the break "
786 "statement.\n");
788 return opt_result::failure_at (stmt,
789 "can't safely apply code motion to dependencies"
790 " to vectorize the early exit. %G may alias with"
791 " %G\n", stmt, dr_read->stmt);
795 if (gimple_vdef (stmt))
797 if (dump_enabled_p ())
798 dump_printf_loc (MSG_NOTE, vect_location,
799 "==> recording stmt %G", stmt);
801 LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (stmt);
803 else if (gimple_vuse (stmt))
805 LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).safe_insert (0, stmt);
806 if (dump_enabled_p ())
807 dump_printf_loc (MSG_NOTE, vect_location,
808 "marked statement for vUSE update: %G", stmt);
812 while (bb != loop->header);
814 /* We don't allow outer -> inner loop transitions which should have been
815 trapped already during loop form analysis. */
816 gcc_assert (dest_bb->loop_father == loop);
818 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo) = dest_bb;
820 if (!LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).is_empty ())
822 /* All uses shall be updated to that of the first load. Entries are
823 stored in reverse order. */
824 tree vuse = gimple_vuse (LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).last ());
825 for (auto g : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
827 if (dump_enabled_p ())
828 dump_printf_loc (MSG_NOTE, vect_location,
829 "will update use: %T, mem_ref: %G", vuse, g);
833 if (dump_enabled_p ())
834 dump_printf_loc (MSG_NOTE, vect_location,
835 "recorded statements to be moved to BB %d\n",
836 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo)->index);
838 return opt_result::success ();
841 /* Function vect_analyze_data_ref_dependences.
843 Examine all the data references in the loop, and make sure there do not
844 exist any data dependences between them. Set *MAX_VF according to
845 the maximum vectorization factor the data dependences allow. */
847 opt_result
848 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
849 unsigned int *max_vf)
851 unsigned int i;
852 struct data_dependence_relation *ddr;
854 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
856 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
858 LOOP_VINFO_DDRS (loop_vinfo)
859 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
860 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
861 /* We do not need read-read dependences. */
862 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
863 &LOOP_VINFO_DDRS (loop_vinfo),
864 LOOP_VINFO_LOOP_NEST (loop_vinfo),
865 false);
866 gcc_assert (res);
869 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
871 /* For epilogues we either have no aliases or alias versioning
872 was applied to original loop. Therefore we may just get max_vf
873 using VF of original loop. */
874 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
875 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
876 else
877 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
879 opt_result res
880 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
881 if (!res)
882 return res;
885 /* If we have early break statements in the loop, check to see if they
886 are of a form we can vectorize. */
887 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
888 return vect_analyze_early_break_dependences (loop_vinfo);
890 return opt_result::success ();
894 /* Function vect_slp_analyze_data_ref_dependence.
896 Return TRUE if there (might) exist a dependence between a memory-reference
897 DRA and a memory-reference DRB for VINFO. When versioning for alias
898 can check the dependence at run-time, return FALSE. */
901 static bool
902 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
903 struct data_dependence_relation *ddr)
905 struct data_reference *dra = DDR_A (ddr);
906 struct data_reference *drb = DDR_B (ddr);
907 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
908 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
910 /* We need to check dependences of statements marked as unvectorizable
911 as well, they still can prohibit vectorization. */
913 /* Independent data accesses. */
914 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
915 return false;
917 if (dra == drb)
918 return false;
920 /* Read-read is OK. */
921 if (DR_IS_READ (dra) && DR_IS_READ (drb))
922 return false;
924 /* If dra and drb are part of the same interleaving chain consider
925 them independent. */
926 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
927 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
928 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
929 return false;
931 /* Unknown data dependence. */
932 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
934 if (dump_enabled_p ())
935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
936 "can't determine dependence between %T and %T\n",
937 DR_REF (dra), DR_REF (drb));
939 else if (dump_enabled_p ())
940 dump_printf_loc (MSG_NOTE, vect_location,
941 "determined dependence between %T and %T\n",
942 DR_REF (dra), DR_REF (drb));
944 return true;
948 /* Analyze dependences involved in the transform of a store SLP NODE. */
950 static bool
951 vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
953 /* This walks over all stmts involved in the SLP store done
954 in NODE verifying we can sink them up to the last stmt in the
955 group. */
956 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
957 gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
959 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
961 stmt_vec_info access_info
962 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
963 if (access_info == last_access_info)
964 continue;
965 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
966 ao_ref ref;
967 bool ref_initialized_p = false;
968 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
969 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
971 gimple *stmt = gsi_stmt (gsi);
972 if (! gimple_vuse (stmt))
973 continue;
975 /* If we couldn't record a (single) data reference for this
976 stmt we have to resort to the alias oracle. */
977 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
978 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
979 if (!dr_b)
981 /* We are moving a store - this means
982 we cannot use TBAA for disambiguation. */
983 if (!ref_initialized_p)
984 ao_ref_init (&ref, DR_REF (dr_a));
985 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
986 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
987 return false;
988 continue;
991 gcc_assert (!gimple_visited_p (stmt));
993 ddr_p ddr = initialize_data_dependence_relation (dr_a,
994 dr_b, vNULL);
995 bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
996 free_dependence_relation (ddr);
997 if (dependent)
998 return false;
1001 return true;
1004 /* Analyze dependences involved in the transform of a load SLP NODE. STORES
1005 contain the vector of scalar stores of this instance if we are
1006 disambiguating the loads. */
1008 static bool
1009 vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
1010 vec<stmt_vec_info> stores,
1011 stmt_vec_info last_store_info)
1013 /* This walks over all stmts involved in the SLP load done
1014 in NODE verifying we can hoist them up to the first stmt in the
1015 group. */
1016 stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
1017 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
1019 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
1021 stmt_vec_info access_info
1022 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
1023 if (access_info == first_access_info)
1024 continue;
1025 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
1026 ao_ref ref;
1027 bool ref_initialized_p = false;
1028 hash_set<stmt_vec_info> grp_visited;
1029 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
1030 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
1032 gimple *stmt = gsi_stmt (gsi);
1033 if (! gimple_vdef (stmt))
1034 continue;
1036 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1038 /* If we run into a store of this same instance (we've just
1039 marked those) then delay dependence checking until we run
1040 into the last store because this is where it will have
1041 been sunk to (and we verified that we can do that already). */
1042 if (gimple_visited_p (stmt))
1044 if (stmt_info != last_store_info)
1045 continue;
1047 for (stmt_vec_info &store_info : stores)
1049 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
1050 ddr_p ddr = initialize_data_dependence_relation
1051 (dr_a, store_dr, vNULL);
1052 bool dependent
1053 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1054 free_dependence_relation (ddr);
1055 if (dependent)
1056 return false;
1058 continue;
1061 auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
1063 /* We are hoisting a load - this means we can use TBAA for
1064 disambiguation. */
1065 if (!ref_initialized_p)
1066 ao_ref_init (&ref, DR_REF (dr_a));
1067 if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
1069 /* If we couldn't record a (single) data reference for this
1070 stmt we have to give up now. */
1071 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1072 if (!dr_b)
1073 return false;
1074 ddr_p ddr = initialize_data_dependence_relation (dr_a,
1075 dr_b, vNULL);
1076 bool dependent
1077 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1078 free_dependence_relation (ddr);
1079 if (dependent)
1080 return false;
1082 /* No dependence. */
1083 return true;
1085 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1087 /* When we run into a store group we have to honor
1088 that earlier stores might be moved here. We don't
1089 know exactly which and where to since we lack a
1090 back-mapping from DR to SLP node, so assume all
1091 earlier stores are sunk here. It's enough to
1092 consider the last stmt of a group for this.
1093 ??? Both this and the fact that we disregard that
1094 the conflicting instance might be removed later
1095 is overly conservative. */
1096 if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
1097 for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
1098 store_info != NULL;
1099 store_info = DR_GROUP_NEXT_ELEMENT (store_info))
1100 if ((store_info == stmt_info
1101 || get_later_stmt (store_info, stmt_info) == stmt_info)
1102 && !check_hoist (store_info))
1103 return false;
1105 else
1107 if (!check_hoist (stmt_info))
1108 return false;
1112 return true;
1116 /* Function vect_slp_analyze_instance_dependence.
1118 Examine all the data references in SLP instance INSTANCE and make sure
1119 there do not exist any data dependences between them that would prevent
1120 it from being vectorized. Return true if vectorization is possible. */
1122 bool
1123 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
1125 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
1127 /* The stores of this instance are at the root of the SLP tree. */
1128 slp_tree store = NULL;
1129 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
1130 store = SLP_INSTANCE_TREE (instance);
1132 /* Verify we can sink stores to the vectorized stmt insert location. */
1133 stmt_vec_info last_store_info = NULL;
1134 if (store)
1136 if (! vect_slp_analyze_store_dependences (vinfo, store))
1137 return false;
1139 /* Mark stores in this instance and remember the last one. */
1140 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
1141 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1142 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
1145 bool res = true;
1147 /* Verify we can sink loads to the vectorized stmt insert location,
1148 special-casing stores of this instance. */
1149 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
1150 if (! vect_slp_analyze_load_dependences (vinfo, load,
1151 store
1152 ? SLP_TREE_SCALAR_STMTS (store)
1153 : vNULL, last_store_info))
1155 res = false;
1156 break;
1159 /* Unset the visited flag. */
1160 if (store)
1161 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1162 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
1164 return res;
1167 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
1168 applied. */
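/* For example (made-up numbers): if the first element of the DR group has
   misalignment 4 against a 16-byte target alignment and DR_INFO starts
   8 bytes later in the group, then with OFFSET 0 the function returns
   (4 + 8) % 16 == 12.  */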
1171 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
1173 HOST_WIDE_INT diff = 0;
1174 /* Alignment is only analyzed for the first element of a DR group,
1175 use that but adjust misalignment by the offset of the access. */
1176 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
1178 dr_vec_info *first_dr
1179 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
1180 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
1181 INTEGER_CSTs and the first element in the group has the lowest
1182 address. */
1183 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
1184 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
1185 gcc_assert (diff >= 0);
1186 dr_info = first_dr;
1189 int misalign = dr_info->misalignment;
1190 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
1191 if (misalign == DR_MISALIGNMENT_UNKNOWN)
1192 return misalign;
1194 /* If the access is only aligned for a vector type with smaller alignment
1195 requirement the access has unknown misalignment. */
1196 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
1197 targetm.vectorize.preferred_vector_alignment (vectype)))
1198 return DR_MISALIGNMENT_UNKNOWN;
1200 /* Apply the offset from the DR group start and the externally supplied
1201 offset which can for example result from a negative stride access. */
1202 poly_int64 misalignment = misalign + diff + offset;
1204 /* vect_compute_data_ref_alignment will have ensured that target_alignment
1205 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
1206 unsigned HOST_WIDE_INT target_alignment_c
1207 = dr_info->target_alignment.to_constant ();
1208 if (!known_misalignment (misalignment, target_alignment_c, &misalign))
1209 return DR_MISALIGNMENT_UNKNOWN;
1210 return misalign;
1213 /* Record the base alignment guarantee given by DRB, which occurs
1214 in STMT_INFO. */
1216 static void
1217 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
1218 innermost_loop_behavior *drb)
1220 bool existed;
1221 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
1222 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
1223 if (!existed || entry.second->base_alignment < drb->base_alignment)
1225 entry = std::make_pair (stmt_info, drb);
1226 if (dump_enabled_p ())
1227 dump_printf_loc (MSG_NOTE, vect_location,
1228 "recording new base alignment for %T\n"
1229 " alignment: %d\n"
1230 " misalignment: %d\n"
1231 " based on: %G",
1232 drb->base_address,
1233 drb->base_alignment,
1234 drb->base_misalignment,
1235 stmt_info->stmt);
1239 /* If the region we're going to vectorize is reached, all unconditional
1240 data references occur at least once. We can therefore pool the base
1241 alignment guarantees from each unconditional reference. Do this by
1242 going through all the data references in VINFO and checking whether
1243 the containing statement makes the reference unconditionally. If so,
1244 record the alignment of the base address in VINFO so that it can be
1245 used for all other references with the same base. */
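/* A hypothetical sketch of the idea:

     for (int i = 0; i < n; ++i)
       {
         p->a[i] += 1;        <- unconditional, base alignment of P recorded
         if (cond[i])
           p->b[i] += 1;      <- conditional, reuses the recorded guarantee
       }

   Both references share the base address P, so the guarantee taken from
   the unconditional access can be applied to the conditional one.  */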
1247 void
1248 vect_record_base_alignments (vec_info *vinfo)
1250 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1251 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
1252 for (data_reference *dr : vinfo->shared->datarefs)
1254 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1255 stmt_vec_info stmt_info = dr_info->stmt;
1256 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
1257 && STMT_VINFO_VECTORIZABLE (stmt_info)
1258 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1260 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
1262 /* If DR is nested in the loop that is being vectorized, we can also
1263 record the alignment of the base wrt the outer loop. */
1264 if (loop && nested_in_vect_loop_p (loop, stmt_info))
1265 vect_record_base_alignment
1266 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
1271 /* Function vect_compute_data_ref_alignment
1273 Compute the misalignment of the data reference DR_INFO when vectorizing
1274 with VECTYPE.
1276 Output:
1277 1. initialized misalignment info for DR_INFO
1279 FOR NOW: No analysis is actually performed. Misalignment is calculated
1280 only for trivial cases. TODO. */
1282 static void
1283 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1284 tree vectype)
1286 stmt_vec_info stmt_info = dr_info->stmt;
1287 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1288 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1289 class loop *loop = NULL;
1290 tree ref = DR_REF (dr_info->dr);
1292 if (dump_enabled_p ())
1293 dump_printf_loc (MSG_NOTE, vect_location,
1294 "vect_compute_data_ref_alignment:\n");
1296 if (loop_vinfo)
1297 loop = LOOP_VINFO_LOOP (loop_vinfo);
1299 /* Initialize misalignment to unknown. */
1300 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1302 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1303 return;
1305 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1306 bool step_preserves_misalignment_p;
1308 poly_uint64 vector_alignment
1309 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1310 BITS_PER_UNIT);
1311 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1313 /* If the main loop has peeled for alignment we have no way of knowing
1314 whether the data accesses in the epilogues are aligned. We can't at
1315 compile time answer the question whether we have entered the main loop or
1316 not. Fixes PR 92351. */
1317 if (loop_vinfo)
1319 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1320 if (orig_loop_vinfo
1321 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1322 return;
1325 unsigned HOST_WIDE_INT vect_align_c;
1326 if (!vector_alignment.is_constant (&vect_align_c))
1327 return;
1329 /* No step for BB vectorization. */
1330 if (!loop)
1332 gcc_assert (integer_zerop (drb->step));
1333 step_preserves_misalignment_p = true;
1336 /* In case the dataref is in an inner-loop of the loop that is being
1337 vectorized (LOOP), we use the base and misalignment information
1338 relative to the outer-loop (LOOP). This is ok only if the misalignment
1339 stays the same throughout the execution of the inner-loop, which is why
1340 we have to check that the stride of the dataref in the inner-loop evenly
1341 divides by the vector alignment. */
1342 else if (nested_in_vect_loop_p (loop, stmt_info))
1344 step_preserves_misalignment_p
1345 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1347 if (dump_enabled_p ())
1349 if (step_preserves_misalignment_p)
1350 dump_printf_loc (MSG_NOTE, vect_location,
1351 "inner step divides the vector alignment.\n");
1352 else
1353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1354 "inner step doesn't divide the vector"
1355 " alignment.\n");
1359 /* Similarly we can only use base and misalignment information relative to
1360 an innermost loop if the misalignment stays the same throughout the
1361 execution of the loop. As above, this is the case if the stride of
1362 the dataref evenly divides by the alignment. */
1363 else
1365 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1366 step_preserves_misalignment_p
1367 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1369 if (!step_preserves_misalignment_p && dump_enabled_p ())
1370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1371 "step doesn't divide the vector alignment.\n");
1374 unsigned int base_alignment = drb->base_alignment;
1375 unsigned int base_misalignment = drb->base_misalignment;
1377 /* Calculate the maximum of the pooled base address alignment and the
1378 alignment that we can compute for DR itself. */
1379 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1380 = base_alignments->get (drb->base_address);
1381 if (entry
1382 && base_alignment < (*entry).second->base_alignment
1383 && (loop_vinfo
1384 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1385 gimple_bb (entry->first->stmt))
1386 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1387 || (entry->first->dr_aux.group <= dr_info->group)))))
1389 base_alignment = entry->second->base_alignment;
1390 base_misalignment = entry->second->base_misalignment;
1393 if (drb->offset_alignment < vect_align_c
1394 || !step_preserves_misalignment_p
1395 /* We need to know whether the step wrt the vectorized loop is
1396 negative when computing the starting misalignment below. */
1397 || TREE_CODE (drb->step) != INTEGER_CST)
1399 if (dump_enabled_p ())
1400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1401 "Unknown alignment for access: %T\n", ref);
1402 return;
1405 if (base_alignment < vect_align_c)
1407 unsigned int max_alignment;
1408 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1409 if (max_alignment < vect_align_c
1410 || !vect_can_force_dr_alignment_p (base,
1411 vect_align_c * BITS_PER_UNIT))
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_NOTE, vect_location,
1415 "can't force alignment of ref: %T\n", ref);
1416 return;
1419 /* Force the alignment of the decl.
1420 NOTE: This is the only change to the code we make during
1421 the analysis phase, before deciding to vectorize the loop. */
1422 if (dump_enabled_p ())
1423 dump_printf_loc (MSG_NOTE, vect_location,
1424 "force alignment of %T\n", ref);
1426 dr_info->base_decl = base;
1427 dr_info->base_misaligned = true;
1428 base_misalignment = 0;
1430 poly_int64 misalignment
1431 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1433 unsigned int const_misalignment;
1434 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1436 if (dump_enabled_p ())
1437 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1438 "Non-constant misalignment for access: %T\n", ref);
1439 return;
1442 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1444 if (dump_enabled_p ())
1445 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1446 "misalign = %d bytes of ref %T\n",
1447 const_misalignment, ref);
1449 return;
1452 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1453 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1454 is made aligned via peeling. */
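/* Illustrative numbers: with 4-byte elements and a 16-byte target
   alignment, the DRs for a[i] and a[i + 4] differ only in DR_INIT, by
   16 bytes, so peeling that aligns one also aligns the other; a[i + 1]
   (a 4-byte difference) would not qualify.  */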
1456 static bool
1457 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1458 dr_vec_info *dr_peel_info)
1460 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1461 DR_TARGET_ALIGNMENT (dr_info)))
1463 poly_offset_int diff
1464 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1465 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1466 if (known_eq (diff, 0)
1467 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1468 return true;
1470 return false;
1473 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1474 aligned via peeling. */
1476 static bool
1477 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1478 dr_vec_info *dr_peel_info)
1480 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1481 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1482 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1483 DR_OFFSET (dr_peel_info->dr), 0)
1484 || !operand_equal_p (DR_STEP (dr_info->dr),
1485 DR_STEP (dr_peel_info->dr), 0))
1486 return false;
1488 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1491 /* Compute the value for dr_info->misalign so that the access appears
1492 aligned. This is used by peeling to compensate for dr_misalignment
1493 applying the offset for negative step. */
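/* For instance (assuming a V4SI vector and a 16-byte target alignment), a
   negative-step access loads the vector from 12 bytes below the scalar
   address, so the value that makes it appear aligned is
   (4 - 1) * 4 == 12 rather than 0.  */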
1496 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1498 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1499 return 0;
1501 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1502 poly_int64 misalignment
1503 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1504 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1506 unsigned HOST_WIDE_INT target_alignment_c;
1507 int misalign;
1508 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1509 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1510 return DR_MISALIGNMENT_UNKNOWN;
1511 return misalign;
1514 /* Function vect_update_misalignment_for_peel.
1515 Sets DR_INFO's misalignment
1516 - to 0 if it has the same alignment as DR_PEEL_INFO,
1517 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1518 - to -1 (unknown) otherwise.
1520 DR_INFO - the data reference whose misalignment is to be adjusted.
1521 DR_PEEL_INFO - the data reference whose misalignment is being made
1522 zero in the vector loop by the peel.
1523 NPEEL - the number of iterations in the peel loop if the misalignment
1524 of DR_PEEL_INFO is known at compile time. */
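/* A worked example with made-up numbers: for a 16-byte target alignment,
   a DR with known misalignment 8 and a 4-byte step, peeling NPEEL == 2
   iterations yields (8 + 2 * 4) & 15 == 0, i.e. the access becomes
   aligned, whereas NPEEL == 1 would leave a misalignment of 12.  */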
1526 static void
1527 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1528 dr_vec_info *dr_peel_info, int npeel)
1530 /* If dr_info is aligned whenever dr_peel_info is, then mark it so. */
1531 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1533 SET_DR_MISALIGNMENT (dr_info,
1534 vect_dr_misalign_for_aligned_access (dr_peel_info));
1535 return;
1538 unsigned HOST_WIDE_INT alignment;
1539 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1540 && known_alignment_for_access_p (dr_info,
1541 STMT_VINFO_VECTYPE (dr_info->stmt))
1542 && known_alignment_for_access_p (dr_peel_info,
1543 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1545 int misal = dr_info->misalignment;
1546 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1547 misal &= alignment - 1;
1548 set_dr_misalignment (dr_info, misal);
1549 return;
1552 if (dump_enabled_p ())
1553 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1554 "to unknown (-1).\n");
1555 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1558 /* Return true if alignment is relevant for DR_INFO. */
1560 static bool
1561 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1563 stmt_vec_info stmt_info = dr_info->stmt;
1565 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1566 return false;
1568 /* For interleaving, only the alignment of the first access matters. */
1569 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1570 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1571 return false;
1573 /* Scatter-gather and invariant accesses continue to address individual
1574 scalars, so vector-level alignment is irrelevant. */
1575 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1576 || integer_zerop (DR_STEP (dr_info->dr)))
1577 return false;
1579 /* Strided accesses perform only component accesses, alignment is
1580 irrelevant for them. */
1581 if (STMT_VINFO_STRIDED_P (stmt_info)
1582 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1583 return false;
1585 return true;
1588 /* Given a memory reference EXP return whether its alignment is less
1589 than its size. */
1591 static bool
1592 not_size_aligned (tree exp)
1594 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1595 return true;
1597 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1598 > get_object_alignment (exp));
1601 /* Function vector_alignment_reachable_p
1603 Return true if vector alignment for DR_INFO is reachable by peeling
1604 a few loop iterations. Return false otherwise. */
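/* For example (illustrative): an access with 4-byte elements that is
   misaligned by 8 bytes against a 16-byte vector becomes aligned after
   peeling two scalar iterations, whereas a misalignment of 2 bytes can
   never be fixed by whole-element peeling; that is the "data size does
   not divide the misalignment" case below.  */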
1606 static bool
1607 vector_alignment_reachable_p (dr_vec_info *dr_info)
1609 stmt_vec_info stmt_info = dr_info->stmt;
1610 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1612 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1614 /* For interleaved access we peel only if the number of iterations in
1615 the prolog loop ({VF - misalignment}) is a multiple of the
1616 number of interleaved accesses. */
1617 int elem_size, mis_in_elements;
1619 /* FORNOW: handle only known alignment. */
1620 if (!known_alignment_for_access_p (dr_info, vectype))
1621 return false;
1623 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1624 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1625 elem_size = vector_element_size (vector_size, nelements);
1626 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1628 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1629 return false;
1632 /* If misalignment is known at compile time then allow peeling
1633 only if natural alignment is reachable through peeling. */
1634 if (known_alignment_for_access_p (dr_info, vectype)
1635 && !aligned_access_p (dr_info, vectype))
1637 HOST_WIDE_INT elmsize =
1638 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1639 if (dump_enabled_p ())
1641 dump_printf_loc (MSG_NOTE, vect_location,
1642 "data size = %wd. misalignment = %d.\n", elmsize,
1643 dr_misalignment (dr_info, vectype));
1645 if (dr_misalignment (dr_info, vectype) % elmsize)
1647 if (dump_enabled_p ())
1648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1649 "data size does not divide the misalignment.\n");
1650 return false;
1654 if (!known_alignment_for_access_p (dr_info, vectype))
1656 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1657 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1658 if (dump_enabled_p ())
1659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1660 "Unknown misalignment, %snaturally aligned\n",
1661 is_packed ? "not " : "");
1662 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1665 return true;
1669 /* Calculate the cost of the memory access represented by DR_INFO. */
1671 static void
1672 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1673 dr_alignment_support alignment_support_scheme,
1674 int misalignment,
1675 unsigned int *inside_cost,
1676 unsigned int *outside_cost,
1677 stmt_vector_for_cost *body_cost_vec,
1678 stmt_vector_for_cost *prologue_cost_vec)
1680 stmt_vec_info stmt_info = dr_info->stmt;
1681 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1682 int ncopies;
1684 if (PURE_SLP_STMT (stmt_info))
1685 ncopies = 1;
1686 else
1687 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1689 if (DR_IS_READ (dr_info->dr))
1690 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1691 misalignment, true, inside_cost,
1692 outside_cost, prologue_cost_vec, body_cost_vec, false);
1693 else
1694 vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1695 misalignment, inside_cost, body_cost_vec);
1697 if (dump_enabled_p ())
1698 dump_printf_loc (MSG_NOTE, vect_location,
1699 "vect_get_data_access_cost: inside_cost = %d, "
1700 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1704 typedef struct _vect_peel_info
1706 dr_vec_info *dr_info;
1707 int npeel;
1708 unsigned int count;
1709 } *vect_peel_info;
1711 typedef struct _vect_peel_extended_info
1713 vec_info *vinfo;
1714 struct _vect_peel_info peel_info;
1715 unsigned int inside_cost;
1716 unsigned int outside_cost;
1717 } *vect_peel_extended_info;
1720 /* Peeling hashtable helpers. */
1722 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1724 static inline hashval_t hash (const _vect_peel_info *);
1725 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1728 inline hashval_t
1729 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1731 return (hashval_t) peel_info->npeel;
1734 inline bool
1735 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1737 return (a->npeel == b->npeel);
1741 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1743 static void
1744 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1745 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1746 int npeel, bool supportable_if_not_aligned)
1748 struct _vect_peel_info elem, *slot;
1749 _vect_peel_info **new_slot;
1751 elem.npeel = npeel;
1752 slot = peeling_htab->find (&elem);
1753 if (slot)
1754 slot->count++;
1755 else
1757 slot = XNEW (struct _vect_peel_info);
1758 slot->npeel = npeel;
1759 slot->dr_info = dr_info;
1760 slot->count = 1;
1761 new_slot = peeling_htab->find_slot (slot, INSERT);
1762 *new_slot = slot;
1765 /* If this DR is not supported with unknown misalignment then bias
1766 this slot when the cost model is disabled. */
1767 if (!supportable_if_not_aligned
1768 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1769 slot->count += VECT_MAX_COST;
1773 /* Traverse peeling hash table to find peeling option that aligns maximum
1774 number of data accesses. */
1777 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1778 _vect_peel_extended_info *max)
1780 vect_peel_info elem = *slot;
1782 if (elem->count > max->peel_info.count
1783 || (elem->count == max->peel_info.count
1784 && max->peel_info.npeel > elem->npeel))
1786 max->peel_info.npeel = elem->npeel;
1787 max->peel_info.count = elem->count;
1788 max->peel_info.dr_info = elem->dr_info;
1791 return 1;
1794 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1795 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1796 npeel is computed at runtime but DR0_INFO's misalignment will be zero
1797 after peeling. */
1799 static void
1800 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1801 dr_vec_info *dr0_info,
1802 unsigned int *inside_cost,
1803 unsigned int *outside_cost,
1804 stmt_vector_for_cost *body_cost_vec,
1805 stmt_vector_for_cost *prologue_cost_vec,
1806 unsigned int npeel)
1808 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1810 bool dr0_alignment_known_p
1811 = (dr0_info
1812 && known_alignment_for_access_p (dr0_info,
1813 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1815 for (data_reference *dr : datarefs)
1817 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1818 if (!vect_relevant_for_alignment_p (dr_info))
1819 continue;
1821 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1822 dr_alignment_support alignment_support_scheme;
1823 int misalignment;
1824 unsigned HOST_WIDE_INT alignment;
1826 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1827 size_zero_node) < 0;
1828 poly_int64 off = 0;
1829 if (negative)
1830 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1831 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1833 if (npeel == 0)
1834 misalignment = dr_misalignment (dr_info, vectype, off);
1835 else if (dr_info == dr0_info
1836 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1837 misalignment = 0;
1838 else if (!dr0_alignment_known_p
1839 || !known_alignment_for_access_p (dr_info, vectype)
1840 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1841 misalignment = DR_MISALIGNMENT_UNKNOWN;
1842 else
1844 misalignment = dr_misalignment (dr_info, vectype, off);
1845 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1846 misalignment &= alignment - 1;
1848 alignment_support_scheme
1849 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1850 misalignment);
1852 vect_get_data_access_cost (loop_vinfo, dr_info,
1853 alignment_support_scheme, misalignment,
1854 inside_cost, outside_cost,
1855 body_cost_vec, prologue_cost_vec);
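/* A minimal sketch of the misalignment update above, with
   hypothetical numbers: target alignment 16 bytes, element step 4
   bytes, npeel 3.  A DR that currently has misalignment 4 becomes

     (4 + 3 * 4) & (16 - 1) == 0

   i.e. aligned after the peel, while a DR with misalignment 8 ends
   up at (8 + 12) & 15 == 4 and is costed as misaligned.  */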
1859 /* Traverse peeling hash table and calculate cost for each peeling option.
1860 Find the one with the lowest cost. */
1863 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1864 _vect_peel_extended_info *min)
1866 vect_peel_info elem = *slot;
1867 int dummy;
1868 unsigned int inside_cost = 0, outside_cost = 0;
1869 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1870 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1871 epilogue_cost_vec;
1873 prologue_cost_vec.create (2);
1874 body_cost_vec.create (2);
1875 epilogue_cost_vec.create (2);
1877 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1878 &outside_cost, &body_cost_vec,
1879 &prologue_cost_vec, elem->npeel);
1881 body_cost_vec.release ();
1883 outside_cost += vect_get_known_peeling_cost
1884 (loop_vinfo, elem->npeel, &dummy,
1885 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1886 &prologue_cost_vec, &epilogue_cost_vec);
1888 /* Prologue and epilogue costs are added to the target model later.
1889 These costs depend only on the scalar iteration cost, the
1890 number of peeling iterations finally chosen, and the number of
1891 misaligned statements. So discard the information found here. */
1892 prologue_cost_vec.release ();
1893 epilogue_cost_vec.release ();
1895 if (inside_cost < min->inside_cost
1896 || (inside_cost == min->inside_cost
1897 && outside_cost < min->outside_cost))
1899 min->inside_cost = inside_cost;
1900 min->outside_cost = outside_cost;
1901 min->peel_info.dr_info = elem->dr_info;
1902 min->peel_info.npeel = elem->npeel;
1903 min->peel_info.count = elem->count;
1906 return 1;
1910 /* Choose best peeling option by traversing peeling hash table and either
1911 choosing an option with the lowest cost (if cost model is enabled) or the
1912 option that aligns as many accesses as possible. */
1914 static struct _vect_peel_extended_info
1915 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1916 loop_vec_info loop_vinfo)
1918 struct _vect_peel_extended_info res;
1920 res.peel_info.dr_info = NULL;
1921 res.vinfo = loop_vinfo;
1923 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1925 res.inside_cost = INT_MAX;
1926 res.outside_cost = INT_MAX;
1927 peeling_htab->traverse <_vect_peel_extended_info *,
1928 vect_peeling_hash_get_lowest_cost> (&res);
1930 else
1932 res.peel_info.count = 0;
1933 peeling_htab->traverse <_vect_peel_extended_info *,
1934 vect_peeling_hash_get_most_frequent> (&res);
1935 res.inside_cost = 0;
1936 res.outside_cost = 0;
1939 return res;
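/* A small worked example of the peeling hash table, with
   hypothetical data-refs: if two DRs become aligned after peeling
   2 scalar iterations and one DR after peeling 6, the table holds
   the slots {npeel 2, count 2} and {npeel 6, count 1}.  With the
   cost model disabled, the most-frequent traversal picks npeel 2;
   otherwise vect_peeling_hash_get_lowest_cost compares the summed
   inside/outside costs of both options and the cheaper one wins.
   Slots whose DR is unsupportable when unaligned get their count
   biased by VECT_MAX_COST, so they are preferred in the
   most-frequent scheme.  */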
1942 /* Return true if the new peeling NPEEL is supported. */
1944 static bool
1945 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1946 unsigned npeel)
1948 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1949 enum dr_alignment_support supportable_dr_alignment;
1951 bool dr0_alignment_known_p
1952 = known_alignment_for_access_p (dr0_info,
1953 STMT_VINFO_VECTYPE (dr0_info->stmt));
1955 /* Ensure that all data refs can be vectorized after the peel. */
1956 for (data_reference *dr : datarefs)
1958 if (dr == dr0_info->dr)
1959 continue;
1961 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1962 if (!vect_relevant_for_alignment_p (dr_info)
1963 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1964 continue;
1966 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1967 int misalignment;
1968 unsigned HOST_WIDE_INT alignment;
1969 if (!dr0_alignment_known_p
1970 || !known_alignment_for_access_p (dr_info, vectype)
1971 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1972 misalignment = DR_MISALIGNMENT_UNKNOWN;
1973 else
1975 misalignment = dr_misalignment (dr_info, vectype);
1976 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1977 misalignment &= alignment - 1;
1979 supportable_dr_alignment
1980 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1981 misalignment);
1982 if (supportable_dr_alignment == dr_unaligned_unsupported)
1983 return false;
1986 return true;
1989 /* Compare two data-references DRA and DRB to group them into chunks
1990 with related alignment. */
1992 static int
1993 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1995 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1996 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1997 int cmp;
1999 /* Stabilize sort. */
2000 if (dra == drb)
2001 return 0;
2003 /* Ordering of DRs according to base. */
2004 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2005 DR_BASE_ADDRESS (drb));
2006 if (cmp != 0)
2007 return cmp;
2009 /* And according to DR_OFFSET. */
2010 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2011 if (cmp != 0)
2012 return cmp;
2014 /* And by step. */
2015 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2016 if (cmp != 0)
2017 return cmp;
2019 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2020 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2021 if (cmp == 0)
2022 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2023 return cmp;
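/* For example, the hypothetical refs a[i], a[i+1] and b[i] sort as
   { a[i], a[i+1], b[i] } (or with b[i] first, depending on the base
   comparison): the two a[] refs share DR_BASE_ADDRESS, DR_OFFSET and
   DR_STEP and are ordered by DR_INIT, so refs whose alignment is
   related end up adjacent for the grouping loop below.  */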
2026 /* Function vect_enhance_data_refs_alignment
2028 This pass will use loop versioning and loop peeling in order to enhance
2029 the alignment of data references in the loop.
2031 FOR NOW: we assume that whatever versioning/peeling takes place, only the
2032 original loop is to be vectorized. Any other loops that are created by
2033 the transformations performed in this pass are not supposed to be
2034 vectorized. This restriction will be relaxed.
2036 This pass will require a cost model to guide it whether to apply peeling
2037 or versioning or a combination of the two. For example, the scheme that
2038 Intel uses when given a loop with several memory accesses is as follows:
2039 choose one memory access ('p') whose alignment you want to force by doing
2040 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
2041 other accesses are not necessarily aligned, or (2) use loop versioning to
2042 generate one loop in which all accesses are aligned, and another loop in
2043 which only 'p' is necessarily aligned.
2045 ("Automatic Intra-Register Vectorization for the Intel Architecture",
2046 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
2047 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
2049 Devising a cost model is the most critical aspect of this work. It will
2050 guide us on which access to peel for, whether to use loop versioning, how
2051 many versions to create, etc. The cost model will probably consist of
2052 generic considerations as well as target specific considerations (on
2053 powerpc for example, misaligned stores are more painful than misaligned
2054 loads).
2056 Here are the general steps involved in alignment enhancements:
2058 -- original loop, before alignment analysis:
2059 for (i=0; i<N; i++){
2060 x = q[i]; # DR_MISALIGNMENT(q) = unknown
2061 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2064 -- After vect_compute_data_refs_alignment:
2065 for (i=0; i<N; i++){
2066 x = q[i]; # DR_MISALIGNMENT(q) = 3
2067 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2070 -- Possibility 1: we do loop versioning:
2071 if (p is aligned) {
2072 for (i=0; i<N; i++){ # loop 1A
2073 x = q[i]; # DR_MISALIGNMENT(q) = 3
2074 p[i] = y; # DR_MISALIGNMENT(p) = 0
2077 else {
2078 for (i=0; i<N; i++){ # loop 1B
2079 x = q[i]; # DR_MISALIGNMENT(q) = 3
2080 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2084 -- Possibility 2: we do loop peeling:
2085 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2086 x = q[i];
2087 p[i] = y;
2089 for (i = 3; i < N; i++){ # loop 2A
2090 x = q[i]; # DR_MISALIGNMENT(q) = 0
2091 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2094 -- Possibility 3: combination of loop peeling and versioning:
2095 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2096 x = q[i];
2097 p[i] = y;
2099 if (p is aligned) {
2100 for (i = 3; i<N; i++){ # loop 3A
2101 x = q[i]; # DR_MISALIGNMENT(q) = 0
2102 p[i] = y; # DR_MISALIGNMENT(p) = 0
2105 else {
2106 for (i = 3; i<N; i++){ # loop 3B
2107 x = q[i]; # DR_MISALIGNMENT(q) = 0
2108 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2112 These loops are later passed to loop_transform to be vectorized. The
2113 vectorizer will use the alignment information to guide the transformation
2114 (whether to generate regular loads/stores, or with special handling for
2115 misalignment). */
2117 opt_result
2118 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
2120 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2121 dr_vec_info *first_store = NULL;
2122 dr_vec_info *dr0_info = NULL;
2123 struct data_reference *dr;
2124 unsigned int i;
2125 bool do_peeling = false;
2126 bool do_versioning = false;
2127 unsigned int npeel = 0;
2128 bool one_misalignment_known = false;
2129 bool one_misalignment_unknown = false;
2130 bool one_dr_unsupportable = false;
2131 dr_vec_info *unsupportable_dr_info = NULL;
2132 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
2133 hash_table<peel_info_hasher> peeling_htab (1);
2135 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
2137 /* Reset data so we can safely be called multiple times. */
2138 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2139 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
2141 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
2142 return opt_result::success ();
2144 /* Sort the vector of datarefs so DRs that have the same or dependent
2145 alignment are next to each other. */
2146 auto_vec<data_reference_p> datarefs
2147 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
2148 datarefs.qsort (dr_align_group_sort_cmp);
2150 /* For each dataref, compute the number of DRs that become aligned
2151 when we peel so that this dataref becomes aligned. */
2152 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
2153 n_same_align_refs.quick_grow_cleared (datarefs.length ());
2154 unsigned i0;
2155 for (i0 = 0; i0 < datarefs.length (); ++i0)
2156 if (DR_BASE_ADDRESS (datarefs[i0]))
2157 break;
2158 for (i = i0 + 1; i <= datarefs.length (); ++i)
2160 if (i == datarefs.length ()
2161 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
2162 DR_BASE_ADDRESS (datarefs[i]), 0)
2163 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
2164 DR_OFFSET (datarefs[i]), 0)
2165 || !operand_equal_p (DR_STEP (datarefs[i0]),
2166 DR_STEP (datarefs[i]), 0))
2168 /* The subgroup [i0, i-1] now only differs in DR_INIT and
2169 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
2170 will get known misalignment if we align one of the refs
2171 with the largest DR_TARGET_ALIGNMENT. */
2172 for (unsigned j = i0; j < i; ++j)
2174 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
2175 for (unsigned k = i0; k < i; ++k)
2177 if (k == j)
2178 continue;
2179 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
2180 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
2181 dr_infoj))
2182 n_same_align_refs[j]++;
2185 i0 = i;
2189 /* While cost model enhancements are expected in the future, the high level
2190 view of the code at this time is as follows:
2192 A) If there is a misaligned access then see if peeling to align
2193 this access can make all data references satisfy
2194 vect_supportable_dr_alignment. If so, update data structures
2195 as needed and return true.
2197 B) If peeling wasn't possible and there is a data reference with an
2198 unknown misalignment that does not satisfy vect_supportable_dr_alignment
2199 then see if loop versioning checks can be used to make all data
2200 references satisfy vect_supportable_dr_alignment. If so, update
2201 data structures as needed and return true.
2203 C) If neither peeling nor versioning were successful then return false if
2204 any data reference does not satisfy vect_supportable_dr_alignment.
2206 D) Return true (all data references satisfy vect_supportable_dr_alignment).
2208 Note, Possibility 3 above (which is peeling and versioning together) is not
2209 being done at this time. */
2211 /* (1) Peeling to force alignment. */
2213 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
2214 Considerations:
2215 + How many accesses will become aligned due to the peeling
2216 - How many accesses will become unaligned due to the peeling,
2217 and the cost of misaligned accesses.
2218 - The cost of peeling (the extra runtime checks, the increase
2219 in code size). */
2221 FOR_EACH_VEC_ELT (datarefs, i, dr)
2223 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2224 if (!vect_relevant_for_alignment_p (dr_info))
2225 continue;
2227 stmt_vec_info stmt_info = dr_info->stmt;
2228 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2229 do_peeling = vector_alignment_reachable_p (dr_info);
2230 if (do_peeling)
2232 if (known_alignment_for_access_p (dr_info, vectype))
2234 unsigned int npeel_tmp = 0;
2235 bool negative = tree_int_cst_compare (DR_STEP (dr),
2236 size_zero_node) < 0;
2238 /* If known_alignment_for_access_p then we have set
2239 DR_MISALIGNMENT which is only done if we know it at compile
2240 time, so it is safe to assume target alignment is constant. */
2242 unsigned int target_align =
2243 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
2244 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
2245 poly_int64 off = 0;
2246 if (negative)
2247 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
2248 unsigned int mis = dr_misalignment (dr_info, vectype, off);
2249 mis = negative ? mis : -mis;
2250 if (mis != 0)
2251 npeel_tmp = (mis & (target_align - 1)) / dr_size;
2253 /* For multiple types, it is possible that the bigger type access
2254 will have more than one peeling option. E.g., a loop with two
2255 types: one of size (vector size / 4), and the other one of
2256 size (vector size / 8). The vectorization factor will be 8. If both
2257 accesses are misaligned by 3, the first one needs one scalar
2258 iteration to be aligned, and the second one needs 5. But the
2259 first one will be aligned also by peeling 5 scalar
2260 iterations, and in that case both accesses will be aligned.
2261 Hence, except for the immediate peeling amount, we also want
2262 to try adding a full vector size, as long as we don't exceed
2263 the vectorization factor.
2264 We do this automatically for the cost model, since we calculate
2265 the cost for every peeling option. */
2266 poly_uint64 nscalars = npeel_tmp;
2267 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2269 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2270 nscalars = (STMT_SLP_TYPE (stmt_info)
2271 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
2274 /* Save info about DR in the hash table. Also include peeling
2275 amounts according to the explanation above. Indicate
2276 the alignment status when the ref is not aligned.
2277 ??? Rather than using unknown alignment here we should
2278 prune all entries from the peeling hashtable which cause
2279 DRs to be not supported. */
2280 bool supportable_if_not_aligned
2281 = vect_supportable_dr_alignment
2282 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2283 while (known_le (npeel_tmp, nscalars))
2285 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2286 dr_info, npeel_tmp,
2287 supportable_if_not_aligned);
2288 npeel_tmp += MAX (1, target_align / dr_size);
2291 one_misalignment_known = true;
2293 else
2295 /* If we don't know any misalignment values, we prefer
2296 peeling for the data-ref that has the maximum number of data-refs
2297 with the same alignment, unless the target prefers to align
2298 stores over loads.
2299 unsigned same_align_drs = n_same_align_refs[i];
2300 if (!dr0_info
2301 || dr0_same_align_drs < same_align_drs)
2303 dr0_same_align_drs = same_align_drs;
2304 dr0_info = dr_info;
2306 /* For data-refs with the same number of related
2307 accesses prefer the one where the misalign
2308 computation will be invariant in the outermost loop. */
2309 else if (dr0_same_align_drs == same_align_drs)
2311 class loop *ivloop0, *ivloop;
2312 ivloop0 = outermost_invariant_loop_for_expr
2313 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2314 ivloop = outermost_invariant_loop_for_expr
2315 (loop, DR_BASE_ADDRESS (dr));
2316 if ((ivloop && !ivloop0)
2317 || (ivloop && ivloop0
2318 && flow_loop_nested_p (ivloop, ivloop0)))
2319 dr0_info = dr_info;
2322 one_misalignment_unknown = true;
2324 /* Check for data refs with unsupportable alignment that
2325 can be peeled. */
2326 enum dr_alignment_support supportable_dr_alignment
2327 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2328 DR_MISALIGNMENT_UNKNOWN);
2329 if (supportable_dr_alignment == dr_unaligned_unsupported)
2331 one_dr_unsupportable = true;
2332 unsupportable_dr_info = dr_info;
2335 if (!first_store && DR_IS_WRITE (dr))
2337 first_store = dr_info;
2338 first_store_same_align_drs = same_align_drs;
2342 else
2344 if (!aligned_access_p (dr_info, vectype))
2346 if (dump_enabled_p ())
2347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2348 "vector alignment may not be reachable\n");
2349 break;
2354 /* Check if we can possibly peel the loop. */
2355 if (!vect_can_advance_ivs_p (loop_vinfo)
2356 || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
2357 LOOP_VINFO_IV_EXIT (loop_vinfo))
2358 || loop->inner)
2359 do_peeling = false;
2361 struct _vect_peel_extended_info peel_for_known_alignment;
2362 struct _vect_peel_extended_info peel_for_unknown_alignment;
2363 struct _vect_peel_extended_info best_peel;
2365 peel_for_unknown_alignment.inside_cost = INT_MAX;
2366 peel_for_unknown_alignment.outside_cost = INT_MAX;
2367 peel_for_unknown_alignment.peel_info.count = 0;
2369 if (do_peeling
2370 && one_misalignment_unknown)
2372 /* Check if the target requires to prefer stores over loads, i.e., if
2373 misaligned stores are more expensive than misaligned loads (taking
2374 drs with same alignment into account). */
2375 unsigned int load_inside_cost = 0;
2376 unsigned int load_outside_cost = 0;
2377 unsigned int store_inside_cost = 0;
2378 unsigned int store_outside_cost = 0;
2379 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2381 stmt_vector_for_cost dummy;
2382 dummy.create (2);
2383 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2384 &load_inside_cost,
2385 &load_outside_cost,
2386 &dummy, &dummy, estimated_npeels);
2387 dummy.release ();
2389 if (first_store)
2391 dummy.create (2);
2392 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2393 &store_inside_cost,
2394 &store_outside_cost,
2395 &dummy, &dummy,
2396 estimated_npeels);
2397 dummy.release ();
2399 else
2401 store_inside_cost = INT_MAX;
2402 store_outside_cost = INT_MAX;
2405 if (load_inside_cost > store_inside_cost
2406 || (load_inside_cost == store_inside_cost
2407 && load_outside_cost > store_outside_cost))
2409 dr0_info = first_store;
2410 dr0_same_align_drs = first_store_same_align_drs;
2411 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2412 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2414 else
2416 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2417 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2420 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2421 prologue_cost_vec.create (2);
2422 epilogue_cost_vec.create (2);
2424 int dummy2;
2425 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2426 (loop_vinfo, estimated_npeels, &dummy2,
2427 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2428 &prologue_cost_vec, &epilogue_cost_vec);
2430 prologue_cost_vec.release ();
2431 epilogue_cost_vec.release ();
2433 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2436 peel_for_unknown_alignment.peel_info.npeel = 0;
2437 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2439 best_peel = peel_for_unknown_alignment;
2441 peel_for_known_alignment.inside_cost = INT_MAX;
2442 peel_for_known_alignment.outside_cost = INT_MAX;
2443 peel_for_known_alignment.peel_info.count = 0;
2444 peel_for_known_alignment.peel_info.dr_info = NULL;
2446 if (do_peeling && one_misalignment_known)
2448 /* Peeling is possible, but there is no data access that is not supported
2449 unless aligned. So we try to choose the best possible peeling from
2450 the hash table. */
2451 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2452 (&peeling_htab, loop_vinfo);
2455 /* Compare costs of peeling for known and unknown alignment. */
2456 if (peel_for_known_alignment.peel_info.dr_info != NULL
2457 && peel_for_unknown_alignment.inside_cost
2458 >= peel_for_known_alignment.inside_cost)
2460 best_peel = peel_for_known_alignment;
2462 /* If the best peeling for known alignment has NPEEL == 0, perform no
2463 peeling at all except if there is an unsupportable dr that we can
2464 align. */
2465 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2466 do_peeling = false;
2469 /* If there is an unsupportable data ref, prefer this over all choices so far
2470 since we'd have to discard a chosen peeling except when it accidentally
2471 aligned the unsupportable data ref. */
2472 if (one_dr_unsupportable)
2473 dr0_info = unsupportable_dr_info;
2474 else if (do_peeling)
2476 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2477 TODO: Use nopeel_outside_cost or get rid of it? */
2478 unsigned nopeel_inside_cost = 0;
2479 unsigned nopeel_outside_cost = 0;
2481 stmt_vector_for_cost dummy;
2482 dummy.create (2);
2483 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2484 &nopeel_outside_cost, &dummy, &dummy, 0);
2485 dummy.release ();
2487 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2488 costs will be recorded. */
2489 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2490 prologue_cost_vec.create (2);
2491 epilogue_cost_vec.create (2);
2493 int dummy2;
2494 nopeel_outside_cost += vect_get_known_peeling_cost
2495 (loop_vinfo, 0, &dummy2,
2496 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2497 &prologue_cost_vec, &epilogue_cost_vec);
2499 prologue_cost_vec.release ();
2500 epilogue_cost_vec.release ();
2502 npeel = best_peel.peel_info.npeel;
2503 dr0_info = best_peel.peel_info.dr_info;
2505 /* If not peeling is no more expensive than the best peeling we
2506 have so far, don't perform any peeling.
2507 if (nopeel_inside_cost <= best_peel.inside_cost)
2508 do_peeling = false;
2511 if (do_peeling)
2513 stmt_vec_info stmt_info = dr0_info->stmt;
2514 if (known_alignment_for_access_p (dr0_info,
2515 STMT_VINFO_VECTYPE (stmt_info)))
2517 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2518 size_zero_node) < 0;
2519 if (!npeel)
2521 /* Since it's known at compile time, compute the number of
2522 iterations in the peeled loop (the peeling factor) for use in
2523 updating DR_MISALIGNMENT values. The peeling factor is the
2524 vectorization factor minus the misalignment as an element
2525 count. */
2526 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2527 poly_int64 off = 0;
2528 if (negative)
2529 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2530 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2531 unsigned int mis
2532 = dr_misalignment (dr0_info, vectype, off);
2533 mis = negative ? mis : -mis;
2534 /* If known_alignment_for_access_p then we have set
2535 DR_MISALIGNMENT which is only done if we know it at compile
2536 time, so it is safe to assume target alignment is constant. */
2538 unsigned int target_align =
2539 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2540 npeel = ((mis & (target_align - 1))
2541 / vect_get_scalar_dr_size (dr0_info));
2544 /* For interleaved data access every iteration accesses all the
2545 members of the group, therefore we divide the number of iterations
2546 by the group size. */
2547 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2548 npeel /= DR_GROUP_SIZE (stmt_info);
2550 if (dump_enabled_p ())
2551 dump_printf_loc (MSG_NOTE, vect_location,
2552 "Try peeling by %d\n", npeel);
2555 /* Ensure that all datarefs can be vectorized after the peel. */
2556 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2557 do_peeling = false;
2559 /* Check if all datarefs are supportable and log. */
2560 if (do_peeling
2561 && npeel == 0
2562 && known_alignment_for_access_p (dr0_info,
2563 STMT_VINFO_VECTYPE (stmt_info)))
2564 return opt_result::success ();
2566 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2567 if (do_peeling)
2569 unsigned max_allowed_peel
2570 = param_vect_max_peeling_for_alignment;
2571 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2572 max_allowed_peel = 0;
2573 if (max_allowed_peel != (unsigned)-1)
2575 unsigned max_peel = npeel;
2576 if (max_peel == 0)
2578 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2579 unsigned HOST_WIDE_INT target_align_c;
2580 if (target_align.is_constant (&target_align_c))
2581 max_peel =
2582 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2583 else
2585 do_peeling = false;
2586 if (dump_enabled_p ())
2587 dump_printf_loc (MSG_NOTE, vect_location,
2588 "Disable peeling, max peels set and vector"
2589 " alignment unknown\n");
2592 if (max_peel > max_allowed_peel)
2594 do_peeling = false;
2595 if (dump_enabled_p ())
2596 dump_printf_loc (MSG_NOTE, vect_location,
2597 "Disable peeling, max peels reached: %d\n", max_peel);
2602 /* Cost model #2 - if peeling may result in a remaining loop not
2603 iterating enough to be vectorized then do not peel. Since this
2604 is a cost heuristic rather than a correctness decision, use the
2605 most likely runtime value for variable vectorization factors. */
2606 if (do_peeling
2607 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2609 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2610 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2611 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2612 < assumed_vf + max_peel)
2613 do_peeling = false;
2616 if (do_peeling)
2618 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2619 If the misalignment of DR_i is identical to that of dr0 then set
2620 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2621 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2622 by the peeling factor times the element size of DR_i (MOD the
2623 vectorization factor times the size). Otherwise, the
2624 misalignment of DR_i must be set to unknown. */
2625 FOR_EACH_VEC_ELT (datarefs, i, dr)
2626 if (dr != dr0_info->dr)
2628 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2629 if (!vect_relevant_for_alignment_p (dr_info))
2630 continue;
2632 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2635 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2636 if (npeel)
2637 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2638 else
2639 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2640 SET_DR_MISALIGNMENT (dr0_info,
2641 vect_dr_misalign_for_aligned_access (dr0_info));
2642 if (dump_enabled_p ())
2644 dump_printf_loc (MSG_NOTE, vect_location,
2645 "Alignment of access forced using peeling.\n");
2646 dump_printf_loc (MSG_NOTE, vect_location,
2647 "Peeling for alignment will be applied.\n");
2650 /* The inside-loop cost will be accounted for in vectorizable_load
2651 and vectorizable_store correctly with adjusted alignments.
2652 Drop the body_cst_vec on the floor here. */
2653 return opt_result::success ();
2657 /* (2) Versioning to force alignment. */
2659 /* Try versioning if:
2660 1) optimize loop for speed and the cost-model is not cheap
2661 2) there is at least one unsupported misaligned data ref with an unknown
2662 misalignment, and
2663 3) all misaligned data refs with a known misalignment are supported, and
2664 4) the number of runtime alignment checks is within reason. */
2666 do_versioning
2667 = (optimize_loop_nest_for_speed_p (loop)
2668 && !loop->inner /* FORNOW */
2669 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2671 if (do_versioning)
2673 FOR_EACH_VEC_ELT (datarefs, i, dr)
2675 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2676 if (!vect_relevant_for_alignment_p (dr_info))
2677 continue;
2679 stmt_vec_info stmt_info = dr_info->stmt;
2680 if (STMT_VINFO_STRIDED_P (stmt_info))
2682 do_versioning = false;
2683 break;
2686 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2687 bool negative = tree_int_cst_compare (DR_STEP (dr),
2688 size_zero_node) < 0;
2689 poly_int64 off = 0;
2690 if (negative)
2691 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2692 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2693 int misalignment;
2694 if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2695 continue;
2697 enum dr_alignment_support supportable_dr_alignment
2698 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2699 misalignment);
2700 if (supportable_dr_alignment == dr_unaligned_unsupported)
2702 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2703 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2704 >= (unsigned) param_vect_max_version_for_alignment_checks))
2706 do_versioning = false;
2707 break;
2710 /* At present we don't support versioning for alignment
2711 with variable VF, since there's no guarantee that the
2712 VF is a power of two. We could relax this if we added
2713 a way of enforcing a power-of-two size. */
2714 unsigned HOST_WIDE_INT size;
2715 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2717 do_versioning = false;
2718 break;
2721 /* Forcing alignment in the first iteration is no good if
2722 we don't keep it across iterations. For now, just disable
2723 versioning in this case.
2724 ?? We could actually unroll the loop to achieve the required
2725 overall step alignment, and forcing the alignment could be
2726 done by doing some iterations of the non-vectorized loop. */
2727 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2728 * DR_STEP_ALIGNMENT (dr),
2729 DR_TARGET_ALIGNMENT (dr_info)))
2731 do_versioning = false;
2732 break;
2735 /* The rightmost bits of an aligned address must be zeros.
2736 Construct the mask needed for this test. For example,
2737 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2738 mask must be 15 = 0xf. */
2739 int mask = size - 1;
2741 /* FORNOW: use the same mask to test all potentially unaligned
2742 references in the loop. */
2743 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2744 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2746 do_versioning = false;
2747 break;
2750 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2751 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2755 /* Versioning requires at least one misaligned data reference. */
2756 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2757 do_versioning = false;
2758 else if (!do_versioning)
2759 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2762 if (do_versioning)
2764 const vec<stmt_vec_info> &may_misalign_stmts
2765 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2766 stmt_vec_info stmt_info;
2768 /* It can now be assumed that the data references in the statements
2769 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2770 of the loop being vectorized. */
2771 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2773 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2774 SET_DR_MISALIGNMENT (dr_info,
2775 vect_dr_misalign_for_aligned_access (dr_info));
2776 if (dump_enabled_p ())
2777 dump_printf_loc (MSG_NOTE, vect_location,
2778 "Alignment of access forced using versioning.\n");
2781 if (dump_enabled_p ())
2782 dump_printf_loc (MSG_NOTE, vect_location,
2783 "Versioning for alignment will be applied.\n");
2785 /* Peeling and versioning can't be done together at this time. */
2786 gcc_assert (! (do_peeling && do_versioning));
2788 return opt_result::success ();
2791 /* This point is reached if neither peeling nor versioning is being done. */
2792 gcc_assert (! (do_peeling || do_versioning));
2794 return opt_result::success ();
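/* The runtime alignment check that versioning relies on is a simple
   mask test.  A minimal sketch with a hypothetical 16-byte vector
   mode and two potentially misaligned pointers p and q:

     int mask = 16 - 1;   // 0xf, shared by all checked refs
     bool aligned = (((uintptr_t) p | (uintptr_t) q) & mask) == 0;

   The vectorized version of the loop is guarded by such a test; the
   other version is used otherwise.  The check only helps if
   VF * DR_STEP_ALIGNMENT is a multiple of the target alignment
   (e.g. VF 4 with a 4-byte step gives 16), so that a pointer that
   passes the test stays aligned in every vector iteration.  */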
2798 /* Function vect_analyze_data_refs_alignment
2800 Analyze the alignment of the data-references in the loop.
2801 Return FALSE if a data reference is found that cannot be vectorized. */
2803 opt_result
2804 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2806 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2808 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2809 struct data_reference *dr;
2810 unsigned int i;
2812 vect_record_base_alignments (loop_vinfo);
2813 FOR_EACH_VEC_ELT (datarefs, i, dr)
2815 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2816 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2818 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2819 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2820 continue;
2821 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2822 STMT_VINFO_VECTYPE (dr_info->stmt));
2826 return opt_result::success ();
2830 /* Analyze alignment of DRs of stmts in NODE. */
2832 static bool
2833 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2835 /* Alignment is maintained in the first element of the group. */
2836 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2837 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2838 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2839 tree vectype = SLP_TREE_VECTYPE (node);
2840 poly_uint64 vector_alignment
2841 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2842 BITS_PER_UNIT);
2843 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2844 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2845 /* Re-analyze alignment when we're facing a vectorization with a bigger
2846 alignment requirement. */
2847 else if (known_lt (dr_info->target_alignment, vector_alignment))
2849 poly_uint64 old_target_alignment = dr_info->target_alignment;
2850 int old_misalignment = dr_info->misalignment;
2851 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2852 /* But keep knowledge about a smaller alignment. */
2853 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2854 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2856 dr_info->target_alignment = old_target_alignment;
2857 dr_info->misalignment = old_misalignment;
2860 /* If we ever face unordered target alignments the first one wins in terms
2861 of analysis and the other will become unknown in dr_misalignment. */
2862 return true;
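/* For example (hypothetical numbers): if the DR was first analysed
   for an 8-byte vector alignment and found to have misalignment 0,
   and the SLP node now asks for 16-byte alignment, the re-analysis
   may only be able to report "unknown".  In that case the old pair
   (target_alignment 8, misalignment 0) is kept, because knowing
   8-byte alignment is still useful to dr_misalignment.  */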
2865 /* Function vect_slp_analyze_instance_alignment
2867 Analyze the alignment of the data-references in the SLP instance.
2868 Return FALSE if a data reference is found that cannot be vectorized. */
2870 bool
2871 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2872 slp_instance instance)
2874 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2876 slp_tree node;
2877 unsigned i;
2878 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2879 if (! vect_slp_analyze_node_alignment (vinfo, node))
2880 return false;
2882 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2883 && ! vect_slp_analyze_node_alignment
2884 (vinfo, SLP_INSTANCE_TREE (instance)))
2885 return false;
2887 return true;
2891 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2892 accesses of legal size, step, etc. Detect gaps, single element
2893 interleaving, and other special cases. Set grouped access info.
2894 Collect groups of strided stores for further use in SLP analysis.
2895 Worker for vect_analyze_group_access. */
2897 static bool
2898 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2900 data_reference *dr = dr_info->dr;
2901 tree step = DR_STEP (dr);
2902 tree scalar_type = TREE_TYPE (DR_REF (dr));
2903 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2904 stmt_vec_info stmt_info = dr_info->stmt;
2905 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2906 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2907 HOST_WIDE_INT dr_step = -1;
2908 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2909 bool slp_impossible = false;
2911 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2912 size of the interleaving group (including gaps). */
2913 if (tree_fits_shwi_p (step))
2915 dr_step = tree_to_shwi (step);
2916 /* Check that STEP is a multiple of type size. Otherwise there is
2917 a non-element-sized gap at the end of the group which we
2918 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2919 ??? As we can handle non-constant step fine here we should
2920 simply remove uses of DR_GROUP_GAP between the last and first
2921 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2922 simply not include that gap. */
2923 if ((dr_step % type_size) != 0)
2925 if (dump_enabled_p ())
2926 dump_printf_loc (MSG_NOTE, vect_location,
2927 "Step %T is not a multiple of the element size"
2928 " for %T\n",
2929 step, DR_REF (dr));
2930 return false;
2932 groupsize = absu_hwi (dr_step) / type_size;
2934 else
2935 groupsize = 0;
2937 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2938 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2940 /* Check if this DR is a part of interleaving, and is a single
2941 element of the group that is accessed in the loop. */
2943 /* Gaps are supported only for loads. STEP must be a multiple of the type
2944 size. */
2945 if (DR_IS_READ (dr)
2946 && (dr_step % type_size) == 0
2947 && groupsize > 0
2948 /* This could be UINT_MAX but as we are generating code in a very
2949 inefficient way we have to cap earlier.
2950 See PR91403 for example. */
2951 && groupsize <= 4096)
2953 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2954 DR_GROUP_SIZE (stmt_info) = groupsize;
2955 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2956 if (dump_enabled_p ())
2957 dump_printf_loc (MSG_NOTE, vect_location,
2958 "Detected single element interleaving %T"
2959 " step %T\n",
2960 DR_REF (dr), step);
2962 return true;
2965 if (dump_enabled_p ())
2966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2967 "not consecutive access %G", stmt_info->stmt);
2969 if (bb_vinfo)
2971 /* Mark the statement as unvectorizable. */
2972 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2973 return true;
2976 if (dump_enabled_p ())
2977 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2978 STMT_VINFO_STRIDED_P (stmt_info) = true;
2979 return true;
2982 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2984 /* First stmt in the interleaving chain. Check the chain. */
2985 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2986 struct data_reference *data_ref = dr;
2987 unsigned int count = 1;
2988 tree prev_init = DR_INIT (data_ref);
2989 HOST_WIDE_INT diff, gaps = 0;
2991 /* By construction, all group members have INTEGER_CST DR_INITs. */
2992 while (next)
2994 /* We never have the same DR multiple times. */
2995 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2996 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2998 data_ref = STMT_VINFO_DATA_REF (next);
3000 /* All group members have the same STEP by construction. */
3001 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
3003 /* Check that the distance between two accesses is equal to the type
3004 size. Otherwise, we have gaps. */
3005 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
3006 - TREE_INT_CST_LOW (prev_init)) / type_size;
3007 if (diff < 1 || diff > UINT_MAX)
3009 /* For artificial testcases with array accesses with large
3010 constant indices we can run into overflow issues which
3011 can end up fooling the groupsize constraint below so
3012 check the individual gaps (which are represented as
3013 unsigned int) as well. */
3014 if (dump_enabled_p ())
3015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3016 "interleaved access with gap larger "
3017 "than representable\n");
3018 return false;
3020 if (diff != 1)
3022 /* FORNOW: SLP of accesses with gaps is not supported. */
3023 slp_impossible = true;
3024 if (DR_IS_WRITE (data_ref))
3026 if (dump_enabled_p ())
3027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3028 "interleaved store with gaps\n");
3029 return false;
3032 gaps += diff - 1;
3035 last_accessed_element += diff;
3037 /* Store the gap from the previous member of the group. If there is no
3038 gap in the access, DR_GROUP_GAP is always 1. */
3039 DR_GROUP_GAP (next) = diff;
3041 prev_init = DR_INIT (data_ref);
3042 next = DR_GROUP_NEXT_ELEMENT (next);
3043 /* Count the number of data-refs in the chain. */
3044 count++;
3047 if (groupsize == 0)
3048 groupsize = count + gaps;
3050 /* This could be UINT_MAX but as we are generating code in a very
3051 inefficient way we have to cap earlier. See PR78699 for example. */
3052 if (groupsize > 4096)
3054 if (dump_enabled_p ())
3055 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3056 "group is too large\n");
3057 return false;
3060 /* Check that the size of the interleaving is equal to count for stores,
3061 i.e., that there are no gaps. */
3062 if (groupsize != count
3063 && !DR_IS_READ (dr))
3065 groupsize = count;
3066 STMT_VINFO_STRIDED_P (stmt_info) = true;
3069 /* If there is a gap after the last load in the group it is the
3070 difference between the groupsize and the last accessed
3071 element.
3072 When there is no gap, this difference should be 0. */
3073 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
3075 DR_GROUP_SIZE (stmt_info) = groupsize;
3076 if (dump_enabled_p ())
3078 dump_printf_loc (MSG_NOTE, vect_location,
3079 "Detected interleaving ");
3080 if (DR_IS_READ (dr))
3081 dump_printf (MSG_NOTE, "load ");
3082 else if (STMT_VINFO_STRIDED_P (stmt_info))
3083 dump_printf (MSG_NOTE, "strided store ");
3084 else
3085 dump_printf (MSG_NOTE, "store ");
3086 dump_printf (MSG_NOTE, "of size %u\n",
3087 (unsigned)groupsize);
3088 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
3089 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3090 while (next)
3092 if (DR_GROUP_GAP (next) != 1)
3093 dump_printf_loc (MSG_NOTE, vect_location,
3094 "\t<gap of %d elements>\n",
3095 DR_GROUP_GAP (next) - 1);
3096 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
3097 next = DR_GROUP_NEXT_ELEMENT (next);
3099 if (DR_GROUP_GAP (stmt_info) != 0)
3100 dump_printf_loc (MSG_NOTE, vect_location,
3101 "\t<gap of %d elements>\n",
3102 DR_GROUP_GAP (stmt_info));
3105 /* SLP: create an SLP data structure for every interleaving group of
3106 stores for further analysis in vect_analyze_slp. */
3107 if (DR_IS_WRITE (dr) && !slp_impossible)
3109 if (loop_vinfo)
3110 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
3111 if (bb_vinfo)
3112 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3116 return true;
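/* A worked example with hypothetical loads of an int array
   (type_size 4) and DR_STEP 16 bytes per iteration:

     ... = a[4*i];        // DR_INIT 0
     ... = a[4*i + 1];    // DR_INIT 4,  diff 1
     ... = a[4*i + 3];    // DR_INIT 12, diff 2 -> one-element gap

   This gives groupsize 16/4 = 4, count 3, a DR_GROUP_GAP of 2 on
   the third load and no gap at the end (last_accessed_element is
   4), but the interior gap makes SLP of the group impossible.  If
   only a[4*i] were loaded we would instead detect single element
   interleaving with DR_GROUP_SIZE 4 and DR_GROUP_GAP 3.  */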
3119 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3120 accesses of legal size, step, etc. Detect gaps, single element
3121 interleaving, and other special cases. Set grouped access info.
3122 Collect groups of strided stores for further use in SLP analysis. */
3124 static bool
3125 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
3127 if (!vect_analyze_group_access_1 (vinfo, dr_info))
3129 /* Dissolve the group if present. */
3130 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
3131 while (stmt_info)
3133 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3134 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3135 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3136 stmt_info = next;
3138 return false;
3140 return true;
3143 /* Analyze the access pattern of the data-reference DR_INFO.
3144 In case of non-consecutive accesses call vect_analyze_group_access() to
3145 analyze groups of accesses. */
3147 static bool
3148 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
3150 data_reference *dr = dr_info->dr;
3151 tree step = DR_STEP (dr);
3152 tree scalar_type = TREE_TYPE (DR_REF (dr));
3153 stmt_vec_info stmt_info = dr_info->stmt;
3154 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3155 class loop *loop = NULL;
3157 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
3158 return true;
3160 if (loop_vinfo)
3161 loop = LOOP_VINFO_LOOP (loop_vinfo);
3163 if (loop_vinfo && !step)
3165 if (dump_enabled_p ())
3166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3167 "bad data-ref access in loop\n");
3168 return false;
3171 /* Allow loads with zero step in inner-loop vectorization. */
3172 if (loop_vinfo && integer_zerop (step))
3174 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3175 if (!nested_in_vect_loop_p (loop, stmt_info))
3176 return DR_IS_READ (dr);
3177 /* Allow references with zero step for outer loops marked
3178 with pragma omp simd only - it guarantees absence of
3179 loop-carried dependencies between inner loop iterations. */
3180 if (loop->safelen < 2)
3182 if (dump_enabled_p ())
3183 dump_printf_loc (MSG_NOTE, vect_location,
3184 "zero step in inner loop of nest\n");
3185 return false;
3189 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3191 /* Interleaved accesses are not yet supported within outer-loop
3192 vectorization for references in the inner-loop. */
3193 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3195 /* For the rest of the analysis we use the outer-loop step. */
3196 step = STMT_VINFO_DR_STEP (stmt_info);
3197 if (integer_zerop (step))
3199 if (dump_enabled_p ())
3200 dump_printf_loc (MSG_NOTE, vect_location,
3201 "zero step in outer loop.\n");
3202 return DR_IS_READ (dr);
3206 /* Consecutive? */
3207 if (TREE_CODE (step) == INTEGER_CST)
3209 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
3210 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
3211 || (dr_step < 0
3212 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
3214 /* Mark that it is not interleaving. */
3215 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3216 return true;
3220 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3222 if (dump_enabled_p ())
3223 dump_printf_loc (MSG_NOTE, vect_location,
3224 "grouped access in outer loop.\n");
3225 return false;
3229 /* Assume this is a DR handled by non-constant strided load case. */
3230 if (TREE_CODE (step) != INTEGER_CST)
3231 return (STMT_VINFO_STRIDED_P (stmt_info)
3232 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3233 || vect_analyze_group_access (vinfo, dr_info)));
3235 /* Not a consecutive access - check if it's part of an interleaving group. */
3236 return vect_analyze_group_access (vinfo, dr_info);
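/* To illustrate the classification above for an access with int
   elements (TYPE_SIZE_UNIT 4), assuming a loop-invariant base:

     DR_STEP ==  4   -> consecutive access
     DR_STEP == -4   -> consecutive access in reverse order
     DR_STEP == 16   -> not consecutive; analysed by
                        vect_analyze_group_access as a potential
                        interleaving of four elements
     non-constant    -> only handled when the access was already
                        marked STMT_VINFO_STRIDED_P  */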
3239 /* Compare two data-references DRA and DRB to group them into chunks
3240 suitable for building interleaving groups. */
3242 static int
3243 dr_group_sort_cmp (const void *dra_, const void *drb_)
3245 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
3246 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
3247 data_reference_p dra = dra_info->dr;
3248 data_reference_p drb = drb_info->dr;
3249 int cmp;
3251 /* Stabilize sort. */
3252 if (dra == drb)
3253 return 0;
3255 /* DRs with different group IDs never belong to the same group. */
3256 if (dra_info->group != drb_info->group)
3257 return dra_info->group < drb_info->group ? -1 : 1;
3259 /* Ordering of DRs according to base. */
3260 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3261 DR_BASE_ADDRESS (drb));
3262 if (cmp != 0)
3263 return cmp;
3265 /* And according to DR_OFFSET. */
3266 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
3267 if (cmp != 0)
3268 return cmp;
3270 /* Put reads before writes. */
3271 if (DR_IS_READ (dra) != DR_IS_READ (drb))
3272 return DR_IS_READ (dra) ? -1 : 1;
3274 /* Then sort by access size. */
3275 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3276 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3277 if (cmp != 0)
3278 return cmp;
3280 /* And by step. */
3281 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3282 if (cmp != 0)
3283 return cmp;
3285 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
3286 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3287 if (cmp == 0)
3288 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3289 return cmp;
3292 /* If OP is the result of a conversion, return the unconverted value,
3293 otherwise return null. */
3295 static tree
3296 strip_conversion (tree op)
3298 if (TREE_CODE (op) != SSA_NAME)
3299 return NULL_TREE;
3300 gimple *stmt = SSA_NAME_DEF_STMT (op);
3301 if (!is_gimple_assign (stmt)
3302 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3303 return NULL_TREE;
3304 return gimple_assign_rhs1 (stmt);
3307 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3308 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3309 be grouped in SLP mode. */
3311 static bool
3312 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3313 bool allow_slp_p)
3315 if (gimple_assign_single_p (stmt1_info->stmt))
3316 return gimple_assign_single_p (stmt2_info->stmt);
3318 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3319 if (call1 && gimple_call_internal_p (call1))
3321 /* Check for two masked loads or two masked stores. */
3322 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3323 if (!call2 || !gimple_call_internal_p (call2))
3324 return false;
3325 internal_fn ifn = gimple_call_internal_fn (call1);
3326 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3327 return false;
3328 if (ifn != gimple_call_internal_fn (call2))
3329 return false;
3331 /* Check that the masks are the same. Cope with casts of masks,
3332 like those created by build_mask_conversion. */
3333 tree mask1 = gimple_call_arg (call1, 2);
3334 tree mask2 = gimple_call_arg (call2, 2);
3335 if (!operand_equal_p (mask1, mask2, 0) && !allow_slp_p)
3337 mask1 = strip_conversion (mask1);
3338 if (!mask1)
3339 return false;
3340 mask2 = strip_conversion (mask2);
3341 if (!mask2)
3342 return false;
3343 if (!operand_equal_p (mask1, mask2, 0))
3344 return false;
3346 return true;
3349 return false;
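/* For instance, two hypothetical masked loads

     vec1 = .MASK_LOAD (ptr1, align, mask);
     mask2 = (othermasktype) mask;
     vec2 = .MASK_LOAD (ptr2, align, mask2);

   can be grouped: both are IFN_MASK_LOAD and stripping the
   conversion from mask2 shows the masks agree.  A masked load and a
   plain load, or two calls with unrelated masks when !ALLOW_SLP_P,
   cannot.  */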
3352 /* Function vect_analyze_data_ref_accesses.
3354 Analyze the access pattern of all the data references in the loop.
3356 FORNOW: the only access pattern that is considered vectorizable is a
3357 simple step 1 (consecutive) access.
3359 FORNOW: handle only arrays and pointer accesses. */
3361 opt_result
3362 vect_analyze_data_ref_accesses (vec_info *vinfo,
3363 vec<int> *dataref_groups)
3365 unsigned int i;
3366 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3368 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3370 if (datarefs.is_empty ())
3371 return opt_result::success ();
3373 /* Sort the array of datarefs to make building the interleaving chains
3374 linear. Don't modify the original vector's order, as it is needed for
3375 determining what dependencies are reversed. */
3376 vec<dr_vec_info *> datarefs_copy;
3377 datarefs_copy.create (datarefs.length ());
3378 for (unsigned i = 0; i < datarefs.length (); i++)
3380 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3381 /* If the caller computed DR grouping use that, otherwise group by
3382 basic blocks. */
3383 if (dataref_groups)
3384 dr_info->group = (*dataref_groups)[i];
3385 else
3386 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3387 datarefs_copy.quick_push (dr_info);
3389 datarefs_copy.qsort (dr_group_sort_cmp);
3390 hash_set<stmt_vec_info> to_fixup;
3392 /* Build the interleaving chains. */
3393 for (i = 0; i < datarefs_copy.length () - 1;)
3395 dr_vec_info *dr_info_a = datarefs_copy[i];
3396 data_reference_p dra = dr_info_a->dr;
3397 int dra_group_id = dr_info_a->group;
3398 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3399 stmt_vec_info lastinfo = NULL;
3400 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3401 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3403 ++i;
3404 continue;
3406 for (i = i + 1; i < datarefs_copy.length (); ++i)
3408 dr_vec_info *dr_info_b = datarefs_copy[i];
3409 data_reference_p drb = dr_info_b->dr;
3410 int drb_group_id = dr_info_b->group;
3411 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3412 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3413 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3414 break;
3416 /* ??? Imperfect sorting (non-compatible types, non-modulo
3417 accesses, same accesses) can lead to a group being artificially
3418 split here as we don't just skip over those. If it really
3419 matters we can push those to a worklist and re-iterate
3420 over them. Then we can just skip ahead to the next DR here. */
3422 /* DRs in a different DR group should not be put into the same
3423 interleaving group. */
3424 if (dra_group_id != drb_group_id)
3425 break;
3427 /* Check that the data-refs have the same first location (except init)
3428 and they are both either store or load (not load and store,
3429 not masked loads or stores). */
3430 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3431 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3432 DR_BASE_ADDRESS (drb)) != 0
3433 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3434 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3435 break;
3437 /* Check that the data-refs have the same constant size. */
3438 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3439 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3440 if (!tree_fits_uhwi_p (sza)
3441 || !tree_fits_uhwi_p (szb)
3442 || !tree_int_cst_equal (sza, szb))
3443 break;
3445 /* Check that the data-refs have the same step. */
3446 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3447 break;
3449 /* Check the types are compatible.
3450 ??? We don't distinguish this during sorting. */
3451 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3452 TREE_TYPE (DR_REF (drb))))
3453 break;
3455 /* Check that the DR_INITs are compile-time constants. */
3456 if (!tree_fits_shwi_p (DR_INIT (dra))
3457 || !tree_fits_shwi_p (DR_INIT (drb)))
3458 break;
3460 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3461 just hold extra information. */
3462 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3463 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3464 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3465 break;
3467 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3468 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3469 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3470 HOST_WIDE_INT init_prev
3471 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3472 gcc_assert (init_a <= init_b
3473 && init_a <= init_prev
3474 && init_prev <= init_b);
3476 /* Do not place the same access in the interleaving chain twice. */
3477 if (init_b == init_prev)
3479 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3480 < gimple_uid (DR_STMT (drb)));
3481 /* Simply link in duplicates and fix up the chain below. */
3483 else
3485 /* If init_b == init_a + the size of the type * k, we have an
3486 interleaving, and DRA is accessed before DRB. */
3487 unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3488 if (type_size_a == 0
3489 || (((unsigned HOST_WIDE_INT)init_b - init_a)
3490 % type_size_a != 0))
3491 break;
3493 /* If we have a store, the accesses are adjacent. This splits
3494 groups into chunks we support (we don't support vectorization
3495 of stores with gaps). */
3496 if (!DR_IS_READ (dra)
3497 && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3498 != type_size_a))
3499 break;
3501 /* If the step (when constant and nonzero) is smaller than the
3502 difference between the data-refs' inits, this splits groups into
3503 suitable sizes. */
3504 if (tree_fits_shwi_p (DR_STEP (dra)))
3506 unsigned HOST_WIDE_INT step
3507 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3508 if (step != 0
3509 && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3510 break;
3514 if (dump_enabled_p ())
3515 dump_printf_loc (MSG_NOTE, vect_location,
3516 DR_IS_READ (dra)
3517 ? "Detected interleaving load %T and %T\n"
3518 : "Detected interleaving store %T and %T\n",
3519 DR_REF (dra), DR_REF (drb));
3521 /* Link the found element into the group list. */
3522 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3524 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3525 lastinfo = stmtinfo_a;
3527 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3528 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3529 lastinfo = stmtinfo_b;
3531 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3532 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3534 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3535 dump_printf_loc (MSG_NOTE, vect_location,
3536 "Load suitable for SLP vectorization only.\n");
3538 if (init_b == init_prev
3539 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3540 && dump_enabled_p ())
3541 dump_printf_loc (MSG_NOTE, vect_location,
3542 "Queuing group with duplicate access for fixup\n");
3546 /* Fix up groups with duplicate entries by splitting them. */
3547 while (1)
3549 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3550 if (!(it != to_fixup.end ()))
3551 break;
3552 stmt_vec_info grp = *it;
3553 to_fixup.remove (grp);
3555 /* Find the earliest duplicate group member. */
3556 unsigned first_duplicate = -1u;
3557 stmt_vec_info next, g = grp;
3558 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3560 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3561 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3562 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3563 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3564 g = next;
3566 if (first_duplicate == -1U)
3567 continue;
3569 /* Then move all stmts after the first duplicate to a new group.
3570 Note this is a heuristic but one with the property that *it
3571 is fixed up completely. */
3572 g = grp;
3573 stmt_vec_info newgroup = NULL, ng = grp;
3574 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3576 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3578 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3579 if (!newgroup)
3580 newgroup = next;
3581 else
3582 DR_GROUP_NEXT_ELEMENT (ng) = next;
3583 ng = next;
3584 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3586 else
3587 g = DR_GROUP_NEXT_ELEMENT (g);
3589 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3591 /* Fixup the new group which still may contain duplicates. */
3592 to_fixup.add (newgroup);
3595 dr_vec_info *dr_info;
3596 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3598 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3599 && !vect_analyze_data_ref_access (vinfo, dr_info))
3601 if (dump_enabled_p ())
3602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3603 "not vectorized: complicated access pattern.\n");
3605 if (is_a <bb_vec_info> (vinfo))
3607 /* Mark the statement as not vectorizable. */
3608 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3609 continue;
3611 else
3613 datarefs_copy.release ();
3614 return opt_result::failure_at (dr_info->stmt->stmt,
3615 "not vectorized:"
3616 " complicated access pattern.\n");
3621 datarefs_copy.release ();
3622 return opt_result::success ();
3625 /* Function vect_vfa_segment_size.
3627 Input:
3628 DR_INFO: The data reference.
3629 LENGTH_FACTOR: segment length to consider.
3631 Return a value suitable for the dr_with_seg_len::seg_len field.
3632 This is the "distance travelled" by the pointer from the first
3633 iteration in the segment to the last. Note that it does not include
3634 the size of the access; in effect it only describes the first byte. */
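/* As a worked example (the numbers are hypothetical): with a DR_STEP of
   4 bytes and a LENGTH_FACTOR of 8 iterations, the function below returns
   4 * (8 - 1) = 28, the number of bytes travelled from the first to the
   last pointer value in the segment.  */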
3636 static tree
3637 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3639 length_factor = size_binop (MINUS_EXPR,
3640 fold_convert (sizetype, length_factor),
3641 size_one_node);
3642 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3643 length_factor);
3646 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3647 gives the worst-case number of bytes covered by the segment. */
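/* As a worked example (hypothetical numbers): for the first statement of
   a group of four int accesses with a group gap of 1, the result is
   4 * (4 - 1) = 12 bytes; if an explicit-realign-optimized scheme might
   touch a whole extra vector, the access size is enlarged accordingly.  */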
3649 static unsigned HOST_WIDE_INT
3650 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3652 stmt_vec_info stmt_vinfo = dr_info->stmt;
3653 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3654 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3655 unsigned HOST_WIDE_INT access_size = ref_size;
3656 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3658 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3659 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3661 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3662 int misalignment;
3663 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3664 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3665 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3666 == dr_explicit_realign_optimized))
3668 /* We might access a full vector's worth. */
3669 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3671 return access_size;
3674 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3675 describes. */
3677 static unsigned int
3678 vect_vfa_align (dr_vec_info *dr_info)
3680 return dr_alignment (dr_info->dr);
3683 /* Function vect_compile_time_alias.
3685 Given data references A and B with equal base and offset, see whether
3686 the alias relation can be decided at compilation time. Return 1 if
3687 it can and the references alias, 0 if it can and the references do
3688 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3689 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3690 of dr_with_seg_len::{seg_len,access_size} for A and B. */
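/* As a worked example (hypothetical numbers): with positive steps,
   DR_INITs of 0 and 32, segment lengths of 16 bytes and access sizes of
   4 bytes, the byte ranges compared are [0, 20) and [32, 52); they cannot
   overlap, so the function returns 0 (no alias).  */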
3692 static int
3693 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3694 tree segment_length_a, tree segment_length_b,
3695 unsigned HOST_WIDE_INT access_size_a,
3696 unsigned HOST_WIDE_INT access_size_b)
3698 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3699 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3700 poly_uint64 const_length_a;
3701 poly_uint64 const_length_b;
3703 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3704 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3705 [a, a+12) */
3706 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3708 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3709 offset_a -= const_length_a;
3711 else
3712 const_length_a = tree_to_poly_uint64 (segment_length_a);
3713 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3715 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3716 offset_b -= const_length_b;
3718 else
3719 const_length_b = tree_to_poly_uint64 (segment_length_b);
3721 const_length_a += access_size_a;
3722 const_length_b += access_size_b;
3724 if (ranges_known_overlap_p (offset_a, const_length_a,
3725 offset_b, const_length_b))
3726 return 1;
3728 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3729 offset_b, const_length_b))
3730 return 0;
3732 return -1;
3735 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3736 in DDR is >= VF. */
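/* As a hypothetical example: if every nonzero distance recorded at
   LOOP_DEPTH is at least 8 and VF is known not to exceed 8, the function
   returns true and the runtime alias check for this DDR can be skipped,
   provided the other preconditions below hold.  */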
3738 static bool
3739 dependence_distance_ge_vf (data_dependence_relation *ddr,
3740 unsigned int loop_depth, poly_uint64 vf)
3742 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3743 || DDR_NUM_DIST_VECTS (ddr) == 0)
3744 return false;
3746 /* If the dependence is exact, we should have limited the VF instead. */
3747 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3749 unsigned int i;
3750 lambda_vector dist_v;
3751 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3753 HOST_WIDE_INT dist = dist_v[loop_depth];
3754 if (dist != 0
3755 && !(dist > 0 && DDR_REVERSED_P (ddr))
3756 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3757 return false;
3760 if (dump_enabled_p ())
3761 dump_printf_loc (MSG_NOTE, vect_location,
3762 "dependence distance between %T and %T is >= VF\n",
3763 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3765 return true;
3768 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3770 static void
3771 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3773 dump_printf (dump_kind, "%s (%T) >= ",
3774 lower_bound.unsigned_p ? "unsigned" : "abs",
3775 lower_bound.expr);
3776 dump_dec (dump_kind, lower_bound.min_value);
3779 /* Record that the vectorized loop requires the vec_lower_bound described
3780 by EXPR, UNSIGNED_P and MIN_VALUE. */
3782 static void
3783 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3784 poly_uint64 min_value)
3786 vec<vec_lower_bound> &lower_bounds
3787 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3788 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3789 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3791 unsigned_p &= lower_bounds[i].unsigned_p;
3792 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3793 if (lower_bounds[i].unsigned_p != unsigned_p
3794 || maybe_lt (lower_bounds[i].min_value, min_value))
3796 lower_bounds[i].unsigned_p = unsigned_p;
3797 lower_bounds[i].min_value = min_value;
3798 if (dump_enabled_p ())
3800 dump_printf_loc (MSG_NOTE, vect_location,
3801 "updating run-time check to ");
3802 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3803 dump_printf (MSG_NOTE, "\n");
3806 return;
3809 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3810 if (dump_enabled_p ())
3812 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3813 dump_lower_bound (MSG_NOTE, lower_bound);
3814 dump_printf (MSG_NOTE, "\n");
3816 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3819 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3820 will span fewer than GAP bytes. */
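/* As a worked example (hypothetical numbers): with an estimated VF of 4,
   a group of two scalar accesses and a scalar access size of 4 bytes,
   the vectorized step is expected to span about 4 * 2 * 4 = 32 bytes,
   so any GAP of up to 32 bytes counts as small here.  */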
3822 static bool
3823 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3824 poly_int64 gap)
3826 stmt_vec_info stmt_info = dr_info->stmt;
3827 HOST_WIDE_INT count
3828 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3829 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3830 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3831 return (estimated_poly_value (gap)
3832 <= count * vect_get_scalar_dr_size (dr_info));
3835 /* Return true if we know that there is no alias between DR_INFO_A and
3836 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3837 When returning true, set *LOWER_BOUND_OUT to this N. */
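/* As a worked example (hypothetical numbers): for two accesses with the
   same base, offset and step, DR_INITs of 0 and 8 and a scalar access
   size of 4 bytes, *LOWER_BOUND_OUT becomes 8 + 4 - 0 = 12, i.e. the
   accesses cannot alias whenever abs (DR_STEP) >= 12.  */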
3839 static bool
3840 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3841 poly_uint64 *lower_bound_out)
3843 /* Check that there is a constant gap of known sign between DR_A
3844 and DR_B. */
3845 data_reference *dr_a = dr_info_a->dr;
3846 data_reference *dr_b = dr_info_b->dr;
3847 poly_int64 init_a, init_b;
3848 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3849 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3850 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3851 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3852 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3853 || !ordered_p (init_a, init_b))
3854 return false;
3856 /* Sort DR_A and DR_B by the address they access. */
3857 if (maybe_lt (init_b, init_a))
3859 std::swap (init_a, init_b);
3860 std::swap (dr_info_a, dr_info_b);
3861 std::swap (dr_a, dr_b);
3864 /* If the two accesses could be dependent within a scalar iteration,
3865 make sure that we'd retain their order. */
3866 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3867 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3868 return false;
3870 /* There is no alias if abs (DR_STEP) is greater than or equal to
3871 the bytes spanned by the combination of the two accesses. */
3872 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3873 return true;
3876 /* Function vect_prune_runtime_alias_test_list.
3878 Prune a list of ddrs to be tested at run-time by versioning for alias.
3879 Merge several alias checks into one if possible.
3880 Return FALSE if the resulting list of ddrs is longer than allowed by
3881 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
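/* A typical source of such checks (a hypothetical example, F and its
   parameters are made up) is a loop over two pointer arguments whose
   targets may overlap:

     void f (int *a, int *b, int n)
     {
       for (int i = 0; i < n; i++)
         a[i] = b[i] + 1;
     }

   Unless the alias can be resolved at compile time below, the pair is
   pushed onto COMP_ALIAS_DDRS and a segment-overlap test is emitted
   before entering the vector loop.  */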
3883 opt_result
3884 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3886 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3887 hash_set <tree_pair_hash> compared_objects;
3889 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3890 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3891 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3892 const vec<vec_object_pair> &check_unequal_addrs
3893 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3894 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3895 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3897 ddr_p ddr;
3898 unsigned int i;
3899 tree length_factor;
3901 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3903 /* Step values are irrelevant for aliasing if the number of vector
3904 iterations is equal to the number of scalar iterations (which can
3905 happen for fully-SLP loops). */
3906 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3908 if (!vf_one_p)
3910 /* Convert the checks for nonzero steps into bound tests. */
3911 tree value;
3912 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3913 vect_check_lower_bound (loop_vinfo, value, true, 1);
3916 if (may_alias_ddrs.is_empty ())
3917 return opt_result::success ();
3919 comp_alias_ddrs.create (may_alias_ddrs.length ());
3921 unsigned int loop_depth
3922 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3923 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3925 /* First, we collect all data ref pairs for aliasing checks. */
3926 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3928 poly_uint64 lower_bound;
3929 tree segment_length_a, segment_length_b;
3930 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3931 unsigned int align_a, align_b;
3933 /* Ignore the alias if the VF we chose ended up being no greater
3934 than the dependence distance. */
3935 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3936 continue;
3938 if (DDR_OBJECT_A (ddr))
3940 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3941 if (!compared_objects.add (new_pair))
3943 if (dump_enabled_p ())
3944 dump_printf_loc (MSG_NOTE, vect_location,
3945 "checking that %T and %T"
3946 " have different addresses\n",
3947 new_pair.first, new_pair.second);
3948 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3950 continue;
3953 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3954 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3956 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3957 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3959 bool preserves_scalar_order_p
3960 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3961 bool ignore_step_p
3962 = (vf_one_p
3963 && (preserves_scalar_order_p
3964 || operand_equal_p (DR_STEP (dr_info_a->dr),
3965 DR_STEP (dr_info_b->dr))));
3967 /* Skip the pair if inter-iteration dependencies are irrelevant
3968 and intra-iteration dependencies are guaranteed to be honored. */
3969 if (ignore_step_p
3970 && (preserves_scalar_order_p
3971 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3972 &lower_bound)))
3974 if (dump_enabled_p ())
3975 dump_printf_loc (MSG_NOTE, vect_location,
3976 "no need for alias check between "
3977 "%T and %T when VF is 1\n",
3978 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3979 continue;
3982 /* See whether we can handle the alias using a bounds check on
3983 the step, and whether that's likely to be the best approach.
3984 (It might not be, for example, if the minimum step is much larger
3985 than the number of bytes handled by one vector iteration.) */
3986 if (!ignore_step_p
3987 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3988 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3989 &lower_bound)
3990 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3991 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3993 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3994 if (dump_enabled_p ())
3996 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3997 "%T and %T when the step %T is outside ",
3998 DR_REF (dr_info_a->dr),
3999 DR_REF (dr_info_b->dr),
4000 DR_STEP (dr_info_a->dr));
4001 if (unsigned_p)
4002 dump_printf (MSG_NOTE, "[0");
4003 else
4005 dump_printf (MSG_NOTE, "(");
4006 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
4008 dump_printf (MSG_NOTE, ", ");
4009 dump_dec (MSG_NOTE, lower_bound);
4010 dump_printf (MSG_NOTE, ")\n");
4012 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
4013 unsigned_p, lower_bound);
4014 continue;
4017 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
4018 if (dr_group_first_a)
4020 stmt_info_a = dr_group_first_a;
4021 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
4024 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
4025 if (dr_group_first_b)
4027 stmt_info_b = dr_group_first_b;
4028 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
4031 if (ignore_step_p)
4033 segment_length_a = size_zero_node;
4034 segment_length_b = size_zero_node;
4036 else
4038 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
4039 DR_STEP (dr_info_b->dr), 0))
4040 length_factor = scalar_loop_iters;
4041 else
4042 length_factor = size_int (vect_factor);
4043 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
4044 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
4046 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
4047 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
4048 align_a = vect_vfa_align (dr_info_a);
4049 align_b = vect_vfa_align (dr_info_b);
4051 /* See whether the alias is known at compilation time. */
4052 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
4053 DR_BASE_ADDRESS (dr_info_b->dr), 0)
4054 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
4055 DR_OFFSET (dr_info_b->dr), 0)
4056 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
4057 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
4058 && poly_int_tree_p (segment_length_a)
4059 && poly_int_tree_p (segment_length_b))
4061 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
4062 segment_length_a,
4063 segment_length_b,
4064 access_size_a,
4065 access_size_b);
4066 if (res >= 0 && dump_enabled_p ())
4068 dump_printf_loc (MSG_NOTE, vect_location,
4069 "can tell at compile time that %T and %T",
4070 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4071 if (res == 0)
4072 dump_printf (MSG_NOTE, " do not alias\n");
4073 else
4074 dump_printf (MSG_NOTE, " alias\n");
4077 if (res == 0)
4078 continue;
4080 if (res == 1)
4081 return opt_result::failure_at (stmt_info_b->stmt,
4082 "not vectorized:"
4083 " compilation time alias: %G%G",
4084 stmt_info_a->stmt,
4085 stmt_info_b->stmt);
4088 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
4089 access_size_a, align_a);
4090 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
4091 access_size_b, align_b);
4092 /* Canonicalize the order to be the one that's needed for accurate
4093 RAW, WAR and WAW flags, in cases where the data references are
4094 well-ordered. The order doesn't really matter otherwise,
4095 but we might as well be consistent. */
4096 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
4097 std::swap (dr_a, dr_b);
4099 dr_with_seg_len_pair_t dr_with_seg_len_pair
4100 (dr_a, dr_b, (preserves_scalar_order_p
4101 ? dr_with_seg_len_pair_t::WELL_ORDERED
4102 : dr_with_seg_len_pair_t::REORDERED));
4104 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
4107 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
4109 unsigned int count = (comp_alias_ddrs.length ()
4110 + check_unequal_addrs.length ());
4112 if (count
4113 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
4114 == VECT_COST_MODEL_VERY_CHEAP))
4115 return opt_result::failure_at
4116 (vect_location, "would need a runtime alias check\n");
4118 if (dump_enabled_p ())
4119 dump_printf_loc (MSG_NOTE, vect_location,
4120 "improved number of alias checks from %d to %d\n",
4121 may_alias_ddrs.length (), count);
4122 unsigned limit = param_vect_max_version_for_alias_checks;
4123 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
4124 limit = param_vect_max_version_for_alias_checks * 6 / 10;
4125 if (count > limit)
4126 return opt_result::failure_at
4127 (vect_location,
4128 "number of versioning for alias run-time tests exceeds %d "
4129 "(--param vect-max-version-for-alias-checks)\n", limit);
4131 return opt_result::success ();
4134 /* Check whether we can use an internal function for a gather load
4135 or scatter store. READ_P is true for loads and false for stores.
4136 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
4137 the type of the memory elements being loaded or stored. OFFSET_TYPE
4138 is the type of the offset that is being applied to the invariant
4139 base address. SCALE is the amount by which the offset should
4140 be multiplied *after* it has been converted to address width.
4142 Return true if the function is supported, storing the function id in
4143 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
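/* As a hypothetical example: if the target only supports gathers with
   64-bit offsets, a 32-bit OFFSET_TYPE is repeatedly doubled below via
   build_nonstandard_integer_type until a supported combination is found,
   or until its precision reaches both POINTER_SIZE and the element width,
   at which point the function fails.  */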
4145 bool
4146 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
4147 tree vectype, tree memory_type, tree offset_type,
4148 int scale, internal_fn *ifn_out,
4149 tree *offset_vectype_out)
4151 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
4152 unsigned int element_bits = vector_element_bits (vectype);
4153 if (element_bits != memory_bits)
4154 /* For now the vector elements must be the same width as the
4155 memory elements. */
4156 return false;
4158 /* Work out which function we need. */
4159 internal_fn ifn, alt_ifn, alt_ifn2;
4160 if (read_p)
4162 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
4163 alt_ifn = IFN_MASK_GATHER_LOAD;
4164 /* When target supports MASK_LEN_GATHER_LOAD, we always
4165 use MASK_LEN_GATHER_LOAD regardless whether len and
4166 mask are valid or not. */
4167 alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
4169 else
4171 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
4172 alt_ifn = IFN_MASK_SCATTER_STORE;
4173 /* When target supports MASK_LEN_SCATTER_STORE, we always
4174 use MASK_LEN_SCATTER_STORE regardless whether len and
4175 mask are valid or not. */
4176 alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
4179 for (;;)
4181 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
4182 if (!offset_vectype)
4183 return false;
4185 /* Test whether the target supports this combination. */
4186 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
4187 offset_vectype, scale))
4189 *ifn_out = ifn;
4190 *offset_vectype_out = offset_vectype;
4191 return true;
4193 else if (!masked_p
4194 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
4195 memory_type,
4196 offset_vectype,
4197 scale))
4199 *ifn_out = alt_ifn;
4200 *offset_vectype_out = offset_vectype;
4201 return true;
4203 else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
4204 memory_type,
4205 offset_vectype, scale))
4207 *ifn_out = alt_ifn2;
4208 *offset_vectype_out = offset_vectype;
4209 return true;
4212 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
4213 && TYPE_PRECISION (offset_type) >= element_bits)
4214 return false;
4216 offset_type = build_nonstandard_integer_type
4217 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
4221 /* STMT_INFO is a call to an internal gather load or scatter store function.
4222 Describe the operation in INFO. */
4224 static void
4225 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
4226 gather_scatter_info *info)
4228 gcall *call = as_a <gcall *> (stmt_info->stmt);
4229 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4230 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4232 info->ifn = gimple_call_internal_fn (call);
4233 info->decl = NULL_TREE;
4234 info->base = gimple_call_arg (call, 0);
4235 info->offset = gimple_call_arg (call, 1);
4236 info->offset_dt = vect_unknown_def_type;
4237 info->offset_vectype = NULL_TREE;
4238 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
4239 info->element_type = TREE_TYPE (vectype);
4240 info->memory_type = TREE_TYPE (DR_REF (dr));
4243 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
4244 gather load or scatter store. Describe the operation in *INFO if so. */
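/* As an illustrative sketch (arrays A and IDX are hypothetical): a
   non-affine read such as

     for (int i = 0; i < n; i++)
       sum += a[idx[i]];

   can become a gather load: BASE is the invariant address of A, OFFSET is
   the SSA name holding idx[i], and SCALE is typically the element size
   when the target supports scaling.  The code below tries to peel the DR
   address apart into exactly that shape.  */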
4246 bool
4247 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
4248 gather_scatter_info *info)
4250 HOST_WIDE_INT scale = 1;
4251 poly_int64 pbitpos, pbitsize;
4252 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4253 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4254 tree offtype = NULL_TREE;
4255 tree decl = NULL_TREE, base, off;
4256 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4257 tree memory_type = TREE_TYPE (DR_REF (dr));
4258 machine_mode pmode;
4259 int punsignedp, reversep, pvolatilep = 0;
4260 internal_fn ifn;
4261 tree offset_vectype;
4262 bool masked_p = false;
4264 /* See whether this is already a call to a gather/scatter internal function.
4265 If not, see whether it's a masked load or store. */
4266 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
4267 if (call && gimple_call_internal_p (call))
4269 ifn = gimple_call_internal_fn (call);
4270 if (internal_gather_scatter_fn_p (ifn))
4272 vect_describe_gather_scatter_call (stmt_info, info);
4273 return true;
4275 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4278 /* True if we should aim to use internal functions rather than
4279 built-in functions. */
4280 bool use_ifn_p = (DR_IS_READ (dr)
4281 ? supports_vec_gather_load_p (TYPE_MODE (vectype))
4282 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4284 base = DR_REF (dr);
4285 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4286 see if we can use the def stmt of the address. */
4287 if (masked_p
4288 && TREE_CODE (base) == MEM_REF
4289 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4290 && integer_zerop (TREE_OPERAND (base, 1))
4291 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4293 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4294 if (is_gimple_assign (def_stmt)
4295 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4296 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4299 /* The gather and scatter builtins need address of the form
4300 loop_invariant + vector * {1, 2, 4, 8}
4302 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4303 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4304 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4305 multiplications and additions in it. To get a vector, we need
4306 a single SSA_NAME that will be defined in the loop and will
4307 contain everything that is not loop invariant and that can be
4308 vectorized. The following code attempts to find such a preexisting
4309 SSA_NAME OFF and put the loop invariants into a tree BASE
4310 that can be gimplified before the loop. */
4311 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4312 &punsignedp, &reversep, &pvolatilep);
4313 if (reversep)
4314 return false;
4316 /* PR 107346. Packed structs can have fields at offsets that are not
4317 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4318 if (!multiple_p (pbitpos, BITS_PER_UNIT))
4319 return false;
4321 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4323 if (TREE_CODE (base) == MEM_REF)
4325 if (!integer_zerop (TREE_OPERAND (base, 1)))
4327 if (off == NULL_TREE)
4328 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4329 else
4330 off = size_binop (PLUS_EXPR, off,
4331 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4333 base = TREE_OPERAND (base, 0);
4335 else
4336 base = build_fold_addr_expr (base);
4338 if (off == NULL_TREE)
4339 off = size_zero_node;
4341 /* If base is not loop invariant, then if off is 0 we start with just
4342 the constant offset in the loop invariant BASE and continue with base
4343 as OFF, otherwise we give up.
4344 We could handle that case by gimplifying the addition of base + off
4345 into some SSA_NAME and use that as off, but for now punt. */
4346 if (!expr_invariant_in_loop_p (loop, base))
4348 if (!integer_zerop (off))
4349 return false;
4350 off = base;
4351 base = size_int (pbytepos);
4353 /* Otherwise put base + constant offset into the loop invariant BASE
4354 and continue with OFF. */
4355 else
4357 base = fold_convert (sizetype, base);
4358 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4361 /* OFF at this point may be either a SSA_NAME or some tree expression
4362 from get_inner_reference. Try to peel off loop invariants from it
4363 into BASE as long as possible. */
4364 STRIP_NOPS (off);
4365 while (offtype == NULL_TREE)
4367 enum tree_code code;
4368 tree op0, op1, add = NULL_TREE;
4370 if (TREE_CODE (off) == SSA_NAME)
4372 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4374 if (expr_invariant_in_loop_p (loop, off))
4375 return false;
4377 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4378 break;
4380 op0 = gimple_assign_rhs1 (def_stmt);
4381 code = gimple_assign_rhs_code (def_stmt);
4382 op1 = gimple_assign_rhs2 (def_stmt);
4384 else
4386 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4387 return false;
4388 code = TREE_CODE (off);
4389 extract_ops_from_tree (off, &code, &op0, &op1);
4391 switch (code)
4393 case POINTER_PLUS_EXPR:
4394 case PLUS_EXPR:
4395 if (expr_invariant_in_loop_p (loop, op0))
4397 add = op0;
4398 off = op1;
4399 do_add:
4400 add = fold_convert (sizetype, add);
4401 if (scale != 1)
4402 add = size_binop (MULT_EXPR, add, size_int (scale));
4403 base = size_binop (PLUS_EXPR, base, add);
4404 continue;
4406 if (expr_invariant_in_loop_p (loop, op1))
4408 add = op1;
4409 off = op0;
4410 goto do_add;
4412 break;
4413 case MINUS_EXPR:
4414 if (expr_invariant_in_loop_p (loop, op1))
4416 add = fold_convert (sizetype, op1);
4417 add = size_binop (MINUS_EXPR, size_zero_node, add);
4418 off = op0;
4419 goto do_add;
4421 break;
4422 case MULT_EXPR:
4423 if (scale == 1 && tree_fits_shwi_p (op1))
4425 int new_scale = tree_to_shwi (op1);
4426 /* Only treat this as a scaling operation if the target
4427 supports it for at least some offset type. */
4428 if (use_ifn_p
4429 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4430 masked_p, vectype, memory_type,
4431 signed_char_type_node,
4432 new_scale, &ifn,
4433 &offset_vectype)
4434 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4435 masked_p, vectype, memory_type,
4436 unsigned_char_type_node,
4437 new_scale, &ifn,
4438 &offset_vectype))
4439 break;
4440 scale = new_scale;
4441 off = op0;
4442 continue;
4444 break;
4445 case SSA_NAME:
4446 off = op0;
4447 continue;
4448 CASE_CONVERT:
4449 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4450 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4451 break;
4453 /* Don't include the conversion if the target is happy with
4454 the current offset type. */
4455 if (use_ifn_p
4456 && TREE_CODE (off) == SSA_NAME
4457 && !POINTER_TYPE_P (TREE_TYPE (off))
4458 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4459 masked_p, vectype, memory_type,
4460 TREE_TYPE (off), scale, &ifn,
4461 &offset_vectype))
4462 break;
4464 if (TYPE_PRECISION (TREE_TYPE (op0))
4465 == TYPE_PRECISION (TREE_TYPE (off)))
4467 off = op0;
4468 continue;
4471 /* Include the conversion if it is widening and we're using
4472 the IFN path or the target can handle the converted-from
4473 offset or the current size is not already the same as the
4474 data vector element size. */
4475 if ((TYPE_PRECISION (TREE_TYPE (op0))
4476 < TYPE_PRECISION (TREE_TYPE (off)))
4477 && (use_ifn_p
4478 || (DR_IS_READ (dr)
4479 ? (targetm.vectorize.builtin_gather
4480 && targetm.vectorize.builtin_gather (vectype,
4481 TREE_TYPE (op0),
4482 scale))
4483 : (targetm.vectorize.builtin_scatter
4484 && targetm.vectorize.builtin_scatter (vectype,
4485 TREE_TYPE (op0),
4486 scale)))
4487 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4488 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4490 off = op0;
4491 offtype = TREE_TYPE (off);
4492 STRIP_NOPS (off);
4493 continue;
4495 break;
4496 default:
4497 break;
4499 break;
4502 /* If at the end OFF still isn't a SSA_NAME or isn't
4503 defined in the loop, punt. */
4504 if (TREE_CODE (off) != SSA_NAME
4505 || expr_invariant_in_loop_p (loop, off))
4506 return false;
4508 if (offtype == NULL_TREE)
4509 offtype = TREE_TYPE (off);
4511 if (use_ifn_p)
4513 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4514 vectype, memory_type, offtype, scale,
4515 &ifn, &offset_vectype))
4516 ifn = IFN_LAST;
4517 decl = NULL_TREE;
4519 else
4521 if (DR_IS_READ (dr))
4523 if (targetm.vectorize.builtin_gather)
4524 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4526 else
4528 if (targetm.vectorize.builtin_scatter)
4529 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4531 ifn = IFN_LAST;
4532 /* The offset vector type will be read from DECL when needed. */
4533 offset_vectype = NULL_TREE;
4536 info->ifn = ifn;
4537 info->decl = decl;
4538 info->base = base;
4539 info->offset = off;
4540 info->offset_dt = vect_unknown_def_type;
4541 info->offset_vectype = offset_vectype;
4542 info->scale = scale;
4543 info->element_type = TREE_TYPE (vectype);
4544 info->memory_type = memory_type;
4545 return true;
4548 /* Find the data references in STMT, analyze them with respect to LOOP and
4549 append them to DATAREFS. Return false if datarefs in this stmt cannot
4550 be handled. */
4552 opt_result
4553 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4554 vec<data_reference_p> *datarefs,
4555 vec<int> *dataref_groups, int group_id)
4557 /* We can ignore clobbers for dataref analysis - they are removed during
4558 loop vectorization and BB vectorization checks dependences with a
4559 stmt walk. */
4560 if (gimple_clobber_p (stmt))
4561 return opt_result::success ();
4563 if (gimple_has_volatile_ops (stmt))
4564 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4565 stmt);
4567 if (stmt_can_throw_internal (cfun, stmt))
4568 return opt_result::failure_at (stmt,
4569 "not vectorized:"
4570 " statement can throw an exception: %G",
4571 stmt);
4573 auto_vec<data_reference_p, 2> refs;
4574 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4575 if (!res)
4576 return res;
4578 if (refs.is_empty ())
4579 return opt_result::success ();
4581 if (refs.length () > 1)
4583 while (!refs.is_empty ())
4584 free_data_ref (refs.pop ());
4585 return opt_result::failure_at (stmt,
4586 "not vectorized: more than one "
4587 "data ref in stmt: %G", stmt);
4590 data_reference_p dr = refs.pop ();
4591 if (gcall *call = dyn_cast <gcall *> (stmt))
4592 if (!gimple_call_internal_p (call)
4593 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4594 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4596 free_data_ref (dr);
4597 return opt_result::failure_at (stmt,
4598 "not vectorized: dr in a call %G", stmt);
4601 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4602 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4604 free_data_ref (dr);
4605 return opt_result::failure_at (stmt,
4606 "not vectorized:"
4607 " statement is an unsupported"
4608 " bitfield access %G", stmt);
4611 if (DR_BASE_ADDRESS (dr)
4612 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4614 free_data_ref (dr);
4615 return opt_result::failure_at (stmt,
4616 "not vectorized:"
4617 " base addr of dr is a constant\n");
4620 /* Check whether this may be a SIMD lane access and adjust the
4621 DR to make it easier for us to handle it. */
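/* As a hypothetical example: under "#pragma omp simd" a privatized
   scalar is expanded into a per-lane array indexed by the result of a
   .GOMP_SIMD_LANE call, so its DR has no usual evolution; the code below
   rewrites such a DR to the invariant base with a zero offset and a step
   equal to the element size, and marks it as a SIMD lane access.  */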
4622 if (loop
4623 && loop->simduid
4624 && (!DR_BASE_ADDRESS (dr)
4625 || !DR_OFFSET (dr)
4626 || !DR_INIT (dr)
4627 || !DR_STEP (dr)))
4629 struct data_reference *newdr
4630 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4631 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4632 if (DR_BASE_ADDRESS (newdr)
4633 && DR_OFFSET (newdr)
4634 && DR_INIT (newdr)
4635 && DR_STEP (newdr)
4636 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4637 && integer_zerop (DR_STEP (newdr)))
4639 tree base_address = DR_BASE_ADDRESS (newdr);
4640 tree off = DR_OFFSET (newdr);
4641 tree step = ssize_int (1);
4642 if (integer_zerop (off)
4643 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4645 off = TREE_OPERAND (base_address, 1);
4646 base_address = TREE_OPERAND (base_address, 0);
4648 STRIP_NOPS (off);
4649 if (TREE_CODE (off) == MULT_EXPR
4650 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4652 step = TREE_OPERAND (off, 1);
4653 off = TREE_OPERAND (off, 0);
4654 STRIP_NOPS (off);
4656 if (CONVERT_EXPR_P (off)
4657 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4658 < TYPE_PRECISION (TREE_TYPE (off))))
4659 off = TREE_OPERAND (off, 0);
4660 if (TREE_CODE (off) == SSA_NAME)
4662 gimple *def = SSA_NAME_DEF_STMT (off);
4663 /* Look through widening conversion. */
4664 if (is_gimple_assign (def)
4665 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4667 tree rhs1 = gimple_assign_rhs1 (def);
4668 if (TREE_CODE (rhs1) == SSA_NAME
4669 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4670 && (TYPE_PRECISION (TREE_TYPE (off))
4671 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4672 def = SSA_NAME_DEF_STMT (rhs1);
4674 if (is_gimple_call (def)
4675 && gimple_call_internal_p (def)
4676 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4678 tree arg = gimple_call_arg (def, 0);
4679 tree reft = TREE_TYPE (DR_REF (newdr));
4680 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4681 arg = SSA_NAME_VAR (arg);
4682 if (arg == loop->simduid
4683 /* For now. */
4684 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4686 DR_BASE_ADDRESS (newdr) = base_address;
4687 DR_OFFSET (newdr) = ssize_int (0);
4688 DR_STEP (newdr) = step;
4689 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4690 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4691 /* Mark as simd-lane access. */
4692 tree arg2 = gimple_call_arg (def, 1);
4693 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4694 free_data_ref (dr);
4695 datarefs->safe_push (newdr);
4696 if (dataref_groups)
4697 dataref_groups->safe_push (group_id);
4698 return opt_result::success ();
4703 free_data_ref (newdr);
4706 datarefs->safe_push (dr);
4707 if (dataref_groups)
4708 dataref_groups->safe_push (group_id);
4709 return opt_result::success ();
4712 /* Function vect_analyze_data_refs.
4714 Find all the data references in the loop or basic block.
4716 The general structure of the analysis of data refs in the vectorizer is as
4717 follows:
4718 1- vect_analyze_data_refs(loop/bb): call
4719 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4720 in the loop/bb and their dependences.
4721 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4722 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4723 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4727 opt_result
4728 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4730 class loop *loop = NULL;
4731 unsigned int i;
4732 struct data_reference *dr;
4733 tree scalar_type;
4735 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4737 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4738 loop = LOOP_VINFO_LOOP (loop_vinfo);
4740 /* Go through the data-refs, check that the analysis succeeded. Update
4741 pointer from stmt_vec_info struct to DR and vectype. */
4743 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4744 FOR_EACH_VEC_ELT (datarefs, i, dr)
4746 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4747 poly_uint64 vf;
4749 gcc_assert (DR_REF (dr));
4750 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4751 gcc_assert (!stmt_info->dr_aux.dr);
4752 stmt_info->dr_aux.dr = dr;
4753 stmt_info->dr_aux.stmt = stmt_info;
4755 /* Check that analysis of the data-ref succeeded. */
4756 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4757 || !DR_STEP (dr))
4759 bool maybe_gather
4760 = DR_IS_READ (dr)
4761 && !TREE_THIS_VOLATILE (DR_REF (dr));
4762 bool maybe_scatter
4763 = DR_IS_WRITE (dr)
4764 && !TREE_THIS_VOLATILE (DR_REF (dr));
4766 /* If the target supports vector gather loads or scatter stores,
4767 see if they can be used. */
4768 if (is_a <loop_vec_info> (vinfo)
4769 && !nested_in_vect_loop_p (loop, stmt_info))
4771 if (maybe_gather || maybe_scatter)
4773 if (maybe_gather)
4774 gatherscatter = GATHER;
4775 else
4776 gatherscatter = SCATTER;
4780 if (gatherscatter == SG_NONE)
4782 if (dump_enabled_p ())
4783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4784 "not vectorized: data ref analysis "
4785 "failed %G", stmt_info->stmt);
4786 if (is_a <bb_vec_info> (vinfo))
4788 /* In BB vectorization the ref can still participate
4789 in dependence analysis, we just can't vectorize it. */
4790 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4791 continue;
4793 return opt_result::failure_at (stmt_info->stmt,
4794 "not vectorized:"
4795 " data ref analysis failed: %G",
4796 stmt_info->stmt);
4800 /* See if this was detected as a SIMD lane access. */
4801 if (dr->aux == (void *)-1
4802 || dr->aux == (void *)-2
4803 || dr->aux == (void *)-3
4804 || dr->aux == (void *)-4)
4806 if (nested_in_vect_loop_p (loop, stmt_info))
4807 return opt_result::failure_at (stmt_info->stmt,
4808 "not vectorized:"
4809 " data ref analysis failed: %G",
4810 stmt_info->stmt);
4811 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4812 = -(uintptr_t) dr->aux;
4815 tree base = get_base_address (DR_REF (dr));
4816 if (base && VAR_P (base) && DECL_NONALIASED (base))
4818 if (dump_enabled_p ())
4819 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4820 "not vectorized: base object not addressable "
4821 "for stmt: %G", stmt_info->stmt);
4822 if (is_a <bb_vec_info> (vinfo))
4824 /* In BB vectorization the ref can still participate
4825 in dependence analysis, we just can't vectorize it. */
4826 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4827 continue;
4829 return opt_result::failure_at (stmt_info->stmt,
4830 "not vectorized: base object not"
4831 " addressable for stmt: %G",
4832 stmt_info->stmt);
4835 if (is_a <loop_vec_info> (vinfo)
4836 && DR_STEP (dr)
4837 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4839 if (nested_in_vect_loop_p (loop, stmt_info))
4840 return opt_result::failure_at (stmt_info->stmt,
4841 "not vectorized: "
4842 "not suitable for strided load %G",
4843 stmt_info->stmt);
4844 STMT_VINFO_STRIDED_P (stmt_info) = true;
4847 /* Update DR field in stmt_vec_info struct. */
4849 /* If the dataref is in an inner-loop of the loop that is considered
4850 for vectorization, we also want to analyze the access relative to
4851 the outer-loop (DR contains information only relative to the
4852 inner-most enclosing loop). We do that by building a reference to the
4853 first location accessed by the inner-loop, and analyze it relative to
4854 the outer-loop. */
4855 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4857 /* Build a reference to the first location accessed by the
4858 inner loop: *(BASE + INIT + OFFSET). By construction,
4859 this address must be invariant in the inner loop, so we
4860 can consider it as being used in the outer loop. */
4861 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4862 tree offset = unshare_expr (DR_OFFSET (dr));
4863 tree init = unshare_expr (DR_INIT (dr));
4864 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4865 init, offset);
4866 tree init_addr = fold_build_pointer_plus (base, init_offset);
4867 tree init_ref = build_fold_indirect_ref (init_addr);
4869 if (dump_enabled_p ())
4870 dump_printf_loc (MSG_NOTE, vect_location,
4871 "analyze in outer loop: %T\n", init_ref);
4873 opt_result res
4874 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4875 init_ref, loop, stmt_info->stmt);
4876 if (!res)
4877 /* dr_analyze_innermost already explained the failure. */
4878 return res;
4880 if (dump_enabled_p ())
4881 dump_printf_loc (MSG_NOTE, vect_location,
4882 "\touter base_address: %T\n"
4883 "\touter offset from base address: %T\n"
4884 "\touter constant offset from base address: %T\n"
4885 "\touter step: %T\n"
4886 "\touter base alignment: %d\n\n"
4887 "\touter base misalignment: %d\n"
4888 "\touter offset alignment: %d\n"
4889 "\touter step alignment: %d\n",
4890 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4891 STMT_VINFO_DR_OFFSET (stmt_info),
4892 STMT_VINFO_DR_INIT (stmt_info),
4893 STMT_VINFO_DR_STEP (stmt_info),
4894 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4895 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4896 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4897 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4900 /* Set vectype for STMT. */
4901 scalar_type = TREE_TYPE (DR_REF (dr));
4902 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4903 if (!vectype)
4905 if (dump_enabled_p ())
4907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4908 "not vectorized: no vectype for stmt: %G",
4909 stmt_info->stmt);
4910 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4911 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4912 scalar_type);
4913 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4916 if (is_a <bb_vec_info> (vinfo))
4918 /* No vector type is fine, the ref can still participate
4919 in dependence analysis, we just can't vectorize it. */
4920 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4921 continue;
4923 if (fatal)
4924 *fatal = false;
4925 return opt_result::failure_at (stmt_info->stmt,
4926 "not vectorized:"
4927 " no vectype for stmt: %G"
4928 " scalar_type: %T\n",
4929 stmt_info->stmt, scalar_type);
4931 else
4933 if (dump_enabled_p ())
4934 dump_printf_loc (MSG_NOTE, vect_location,
4935 "got vectype for stmt: %G%T\n",
4936 stmt_info->stmt, vectype);
4939 /* Adjust the minimal vectorization factor according to the
4940 vector type. */
4941 vf = TYPE_VECTOR_SUBPARTS (vectype);
4942 *min_vf = upper_bound (*min_vf, vf);
4944 /* Leave the BB vectorizer to pick the vector type later, based on
4945 the final dataref group size and SLP node size. */
4946 if (is_a <loop_vec_info> (vinfo))
4947 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4949 if (gatherscatter != SG_NONE)
4951 gather_scatter_info gs_info;
4952 if (!vect_check_gather_scatter (stmt_info,
4953 as_a <loop_vec_info> (vinfo),
4954 &gs_info)
4955 || !get_vectype_for_scalar_type (vinfo,
4956 TREE_TYPE (gs_info.offset)))
4958 if (fatal)
4959 *fatal = false;
4960 return opt_result::failure_at
4961 (stmt_info->stmt,
4962 (gatherscatter == GATHER)
4963 ? "not vectorized: not suitable for gather load %G"
4964 : "not vectorized: not suitable for scatter store %G",
4965 stmt_info->stmt);
4967 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4971 /* We used to stop processing and prune the list here. Verify we no
4972 longer need to. */
4973 gcc_assert (i == datarefs.length ());
4975 return opt_result::success ();
4979 /* Function vect_get_new_vect_var.
4981 Returns a name for a new variable. The current naming scheme chooses a
4982 prefix such as "vect", "vectp", "stmp" or "mask" (depending on the value
4983 of VAR_KIND) for vectorizer-generated variables and combines it with NAME
4984 if provided. */
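/* As a hypothetical example: vect_get_new_vect_var (vectype,
   vect_pointer_var, "a") yields a temporary named along the lines of
   "vectp_a", while passing a NULL name falls back to the bare prefix.  */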
4986 tree
4987 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4989 const char *prefix;
4990 tree new_vect_var;
4992 switch (var_kind)
4994 case vect_simple_var:
4995 prefix = "vect";
4996 break;
4997 case vect_scalar_var:
4998 prefix = "stmp";
4999 break;
5000 case vect_mask_var:
5001 prefix = "mask";
5002 break;
5003 case vect_pointer_var:
5004 prefix = "vectp";
5005 break;
5006 default:
5007 gcc_unreachable ();
5010 if (name)
5012 char* tmp = concat (prefix, "_", name, NULL);
5013 new_vect_var = create_tmp_reg (type, tmp);
5014 free (tmp);
5016 else
5017 new_vect_var = create_tmp_reg (type, prefix);
5019 return new_vect_var;
5022 /* Like vect_get_new_vect_var but return an SSA name. */
5024 tree
5025 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
5027 const char *prefix;
5028 tree new_vect_var;
5030 switch (var_kind)
5032 case vect_simple_var:
5033 prefix = "vect";
5034 break;
5035 case vect_scalar_var:
5036 prefix = "stmp";
5037 break;
5038 case vect_pointer_var:
5039 prefix = "vectp";
5040 break;
5041 default:
5042 gcc_unreachable ();
5045 if (name)
5047 char* tmp = concat (prefix, "_", name, NULL);
5048 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
5049 free (tmp);
5051 else
5052 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
5054 return new_vect_var;
5057 /* Duplicate points-to info on NAME from DR_INFO. */
5059 static void
5060 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
5062 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
5063 /* DR_PTR_INFO is for a base SSA name, not including constant or
5064 variable offsets in the ref so its alignment info does not apply. */
5065 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
5068 /* Function vect_create_addr_base_for_vector_ref.
5070 Create an expression that computes the address of the first memory location
5071 that will be accessed for a data reference.
5073 Input:
5074 STMT_INFO: The statement containing the data reference.
5075 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
5076 OFFSET: Optional. If supplied, it is added to the initial address.
5077 LOOP: Specify relative to which loop nest the address should be computed.
5078 For example, when the dataref is in an inner-loop nested in an
5079 outer-loop that is now being vectorized, LOOP can be either the
5080 outer-loop, or the inner-loop. The first memory location accessed
5081 by the following dataref ('in' points to short):
5083 for (i=0; i<N; i++)
5084 for (j=0; j<M; j++)
5085 s += in[i+j]
5087 is as follows:
5088 if LOOP=i_loop: &in (relative to i_loop)
5089 if LOOP=j_loop: &in+i*2B (relative to j_loop)
5091 Output:
5092 1. Return an SSA_NAME whose value is the address of the memory location of
5093 the first vector of the data reference.
5094 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
5095 these statement(s) which define the returned SSA_NAME.
5097 FORNOW: We are only handling array accesses with step 1. */
5099 tree
5100 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
5101 gimple_seq *new_stmt_list,
5102 tree offset)
5104 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5105 struct data_reference *dr = dr_info->dr;
5106 const char *base_name;
5107 tree addr_base;
5108 tree dest;
5109 gimple_seq seq = NULL;
5110 tree vect_ptr_type;
5111 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5112 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
5114 tree data_ref_base = unshare_expr (drb->base_address);
5115 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
5116 tree init = unshare_expr (drb->init);
5118 if (loop_vinfo)
5119 base_name = get_name (data_ref_base);
5120 else
5122 base_offset = ssize_int (0);
5123 init = ssize_int (0);
5124 base_name = get_name (DR_REF (dr));
5127 /* Create base_offset */
5128 base_offset = size_binop (PLUS_EXPR,
5129 fold_convert (sizetype, base_offset),
5130 fold_convert (sizetype, init));
5132 if (offset)
5134 offset = fold_convert (sizetype, offset);
5135 base_offset = fold_build2 (PLUS_EXPR, sizetype,
5136 base_offset, offset);
5139 /* base + base_offset */
5140 if (loop_vinfo)
5141 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
5142 else
5143 addr_base = build1 (ADDR_EXPR,
5144 build_pointer_type (TREE_TYPE (DR_REF (dr))),
5145 /* Strip zero offset components since we don't need
5146 them and they can confuse late diagnostics if
5147 we CSE them wrongly. See PR106904 for example. */
5148 unshare_expr (strip_zero_offset_components
5149 (DR_REF (dr))));
5151 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
5152 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
5153 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
5154 gimple_seq_add_seq (new_stmt_list, seq);
5156 if (DR_PTR_INFO (dr)
5157 && TREE_CODE (addr_base) == SSA_NAME
5158 /* We should only duplicate pointer info to newly created SSA names. */
5159 && SSA_NAME_VAR (addr_base) == dest)
5161 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
5162 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
5165 if (dump_enabled_p ())
5166 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
5168 return addr_base;
5172 /* Function vect_create_data_ref_ptr.
5174 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
5175 location accessed in the loop by STMT_INFO, along with the def-use update
5176 chain to appropriately advance the pointer through the loop iterations.
5177 Also set aliasing information for the pointer. This pointer is used by
5178 the callers to this function to create a memory reference expression for
5179 vector load/store access.
5181 Input:
5182 1. STMT_INFO: a stmt that references memory. Expected to be of the form
5183 GIMPLE_ASSIGN <name, data-ref> or
5184 GIMPLE_ASSIGN <data-ref, name>.
5185 2. AGGR_TYPE: the type of the reference, which should be either a vector
5186 or an array.
5187 3. AT_LOOP: the loop where the vector memref is to be created.
5188 4. OFFSET (optional): a byte offset to be added to the initial address
5189 accessed by the data-ref in STMT_INFO.
5190 5. BSI: location where the new stmts are to be placed if there is no loop
5191 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
5192 pointing to the initial address.
5193 7. IV_STEP (optional, defaults to NULL): the amount that should be added
5194 to the IV during each iteration of the loop. NULL says to move
5195 by one copy of AGGR_TYPE up or down, depending on the step of the
5196 data reference.
5198 Output:
5199 1. Declare a new ptr to vector_type, and have it point to the base of the
5200 data reference (initial address accessed by the data reference).
5201 For example, for vector of type V8HI, the following code is generated:
5203 v8hi *ap;
5204 ap = (v8hi *)initial_address;
5206 if OFFSET is not supplied:
5207 initial_address = &a[init];
5208 if OFFSET is supplied:
5209 initial_address = &a[init] + OFFSET;
5213 Return the initial_address in INITIAL_ADDRESS.
5215 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
5216 update the pointer in each iteration of the loop.
5218 Return the increment stmt that updates the pointer in PTR_INCR.
5220 3. Return the pointer. */
5222 tree
5223 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
5224 tree aggr_type, class loop *at_loop, tree offset,
5225 tree *initial_address, gimple_stmt_iterator *gsi,
5226 gimple **ptr_incr, bool only_init,
5227 tree iv_step)
5229 const char *base_name;
5230 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5231 class loop *loop = NULL;
5232 bool nested_in_vect_loop = false;
5233 class loop *containing_loop = NULL;
5234 tree aggr_ptr_type;
5235 tree aggr_ptr;
5236 tree new_temp;
5237 gimple_seq new_stmt_list = NULL;
5238 edge pe = NULL;
5239 basic_block new_bb;
5240 tree aggr_ptr_init;
5241 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5242 struct data_reference *dr = dr_info->dr;
5243 tree aptr;
5244 gimple_stmt_iterator incr_gsi;
5245 bool insert_after;
5246 tree indx_before_incr, indx_after_incr;
5247 gimple *incr;
5248 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5250 gcc_assert (iv_step != NULL_TREE
5251 || TREE_CODE (aggr_type) == ARRAY_TYPE
5252 || TREE_CODE (aggr_type) == VECTOR_TYPE);
5254 if (loop_vinfo)
5256 loop = LOOP_VINFO_LOOP (loop_vinfo);
5257 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5258 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5259 pe = loop_preheader_edge (loop);
5261 else
5263 gcc_assert (bb_vinfo);
5264 only_init = true;
5265 *ptr_incr = NULL;
5268 /* Create an expression for the first address accessed by this load
5269 in LOOP. */
5270 base_name = get_name (DR_BASE_ADDRESS (dr));
5272 if (dump_enabled_p ())
5274 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
5275 dump_printf_loc (MSG_NOTE, vect_location,
5276 "create %s-pointer variable to type: %T",
5277 get_tree_code_name (TREE_CODE (aggr_type)),
5278 aggr_type);
5279 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
5280 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
5281 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
5282 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
5283 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5284 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
5285 else
5286 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
5287 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5290 /* (1) Create the new aggregate-pointer variable.
5291 Vector and array types inherit the alias set of their component
5292 type by default so we need to use a ref-all pointer if the data
5293 reference does not conflict with the created aggregate data
5294 reference because it is not addressable. */
5295 bool need_ref_all = false;
5296 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5297 get_alias_set (DR_REF (dr))))
5298 need_ref_all = true;
5299 /* Likewise for any of the data references in the stmt group. */
5300 else if (DR_GROUP_SIZE (stmt_info) > 1)
5302 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5305 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5306 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5307 get_alias_set (DR_REF (sdr))))
5309 need_ref_all = true;
5310 break;
5312 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5314 while (sinfo);
5316 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5317 need_ref_all);
5318 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5321 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5322 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5323 def-use update cycles for the pointer: one relative to the outer-loop
5324 (LOOP), which is what steps (3) and (4) below do. The other is relative
5325 to the inner-loop (which is the inner-most loop containing the dataref),
5326 and this is done by step (5) below.
5328 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5329 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5330 redundant. Steps (3),(4) create the following:
5332 vp0 = &base_addr;
5333 LOOP: vp1 = phi(vp0,vp2)
5336 vp2 = vp1 + step
5337 goto LOOP
5339 If there is an inner-loop nested in LOOP, then step (5) will also be
5340 applied, and an additional update in the inner-loop will be created:
5342 vp0 = &base_addr;
5343 LOOP: vp1 = phi(vp0,vp2)
5345 inner: vp3 = phi(vp1,vp4)
5346 vp4 = vp3 + inner_step
5347 if () goto inner
5349 vp2 = vp1 + step
5350 if () goto LOOP */
5352 /* (2) Calculate the initial address of the aggregate-pointer, and set
5353 the aggregate-pointer to point to it before the loop. */
5355 /* Create: &(base[init_val] + offset) in the loop preheader. */
5357 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5358 stmt_info, &new_stmt_list,
5359 offset);
5360 if (new_stmt_list)
5362 if (pe)
5364 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5365 gcc_assert (!new_bb);
5367 else
5368 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5371 *initial_address = new_temp;
5372 aggr_ptr_init = new_temp;
5374 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5375 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5376 inner-loop nested in LOOP (during outer-loop vectorization). */
5378 /* No update in loop is required. */
5379 if (only_init && (!loop_vinfo || at_loop == loop))
5380 aptr = aggr_ptr_init;
5381 else
5383 /* Accesses to invariant addresses should be handled specially
5384 by the caller. */
5385 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5386 gcc_assert (!integer_zerop (step));
5388 if (iv_step == NULL_TREE)
5390 /* The step of the aggregate pointer is the type size,
5391 negated for downward accesses. */
5392 iv_step = TYPE_SIZE_UNIT (aggr_type);
5393 if (tree_int_cst_sgn (step) == -1)
5394 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5397 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5399 create_iv (aggr_ptr_init, PLUS_EXPR,
5400 fold_convert (aggr_ptr_type, iv_step),
5401 aggr_ptr, loop, &incr_gsi, insert_after,
5402 &indx_before_incr, &indx_after_incr);
5403 incr = gsi_stmt (incr_gsi);
5405 /* Copy the points-to information if it exists. */
5406 if (DR_PTR_INFO (dr))
5408 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5409 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5411 if (ptr_incr)
5412 *ptr_incr = incr;
5414 aptr = indx_before_incr;
5417 if (!nested_in_vect_loop || only_init)
5418 return aptr;
5421 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5422 nested in LOOP, if one exists. */
5424 gcc_assert (nested_in_vect_loop);
5425 if (!only_init)
5427 standard_iv_increment_position (containing_loop, &incr_gsi,
5428 &insert_after);
5429 create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5430 aggr_ptr, containing_loop, &incr_gsi, insert_after,
5431 &indx_before_incr, &indx_after_incr);
5432 incr = gsi_stmt (incr_gsi);
5434 /* Copy the points-to information if it exists. */
5435 if (DR_PTR_INFO (dr))
5437 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5438 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5440 if (ptr_incr)
5441 *ptr_incr = incr;
5443 return indx_before_incr;
5445 else
5446 gcc_unreachable ();
5450 /* Function bump_vector_ptr
5452 Increment a pointer (to a vector type) by vector-size. If requested,
5453 i.e. if PTR_INCR is given, then also connect the new increment stmt
5454 to the existing def-use update-chain of the pointer, by modifying
5455 the PTR_INCR as illustrated below:
5457 The pointer def-use update-chain before this function:
5458 DATAREF_PTR = phi (p_0, p_2)
5459 ....
5460 PTR_INCR: p_2 = DATAREF_PTR + step
5462 The pointer def-use update-chain after this function:
5463 DATAREF_PTR = phi (p_0, p_2)
5464 ....
5465 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5466 ....
5467 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5469 Input:
5470 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5471 in the loop.
5472 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5473 the loop. The increment amount across iterations is expected
5474 to be vector_size.
5475 BSI - location where the new update stmt is to be placed.
5476 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5477 BUMP - optional. The offset by which to bump the pointer. If not given,
5478 the offset is assumed to be vector_size.
5480 Output: Return NEW_DATAREF_PTR as illustrated above.
5484 tree
5485 bump_vector_ptr (vec_info *vinfo,
5486 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5487 stmt_vec_info stmt_info, tree bump)
5489 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5490 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5491 tree update = TYPE_SIZE_UNIT (vectype);
5492 gimple *incr_stmt;
5493 ssa_op_iter iter;
5494 use_operand_p use_p;
5495 tree new_dataref_ptr;
5497 if (bump)
5498 update = bump;
5500 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5501 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5502 else if (is_gimple_min_invariant (dataref_ptr))
5503 /* When possible avoid emitting a separate increment stmt that would
5504 force the addressed object to become addressable. */
5505 return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5506 fold_build2 (MEM_REF,
5507 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5508 dataref_ptr,
5509 fold_convert (ptr_type_node, update)));
5510 else
5511 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5512 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5513 dataref_ptr, update);
5514 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5515 /* Fold the increment, avoiding excessively long use-def chains of such
5516 increments, which lead to compile-time issues for passes running before
5517 the next forwprop pass (which would otherwise do this folding). */
5518 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5519 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5521 incr_stmt = gsi_stmt (fold_gsi);
5522 update_stmt (incr_stmt);
5525 /* Copy the points-to information if it exists. */
5526 if (DR_PTR_INFO (dr))
5528 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5529 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5532 if (!ptr_incr)
5533 return new_dataref_ptr;
5535 /* Update the vector-pointer's cross-iteration increment. */
5536 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5538 tree use = USE_FROM_PTR (use_p);
5540 if (use == dataref_ptr)
5541 SET_USE (use_p, new_dataref_ptr);
5542 else
5543 gcc_assert (operand_equal_p (use, update, 0));
5546 return new_dataref_ptr;
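/* Illustrative sketch (SSA names made up for the example): with the
   default BUMP of TYPE_SIZE_UNIT (vectype), e.g. 16 bytes for V8HI, the
   statement emitted at GSI for an SSA pointer vectp.4_3 is

     vectp.4_8 = vectp.4_3 + 16;

   and, if PTR_INCR was  p_2 = vectp.4_3 + step,  its use of vectp.4_3 is
   redirected to vectp.4_8 so that the cross-iteration increment starts
   from the bumped pointer, exactly as in the diagram above.  */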
5550 /* Copy memory reference info such as base/clique from the SRC reference
5551 to the DEST MEM_REF. */
5553 void
5554 vect_copy_ref_info (tree dest, tree src)
5556 if (TREE_CODE (dest) != MEM_REF)
5557 return;
5559 tree src_base = src;
5560 while (handled_component_p (src_base))
5561 src_base = TREE_OPERAND (src_base, 0);
5562 if (TREE_CODE (src_base) != MEM_REF
5563 && TREE_CODE (src_base) != TARGET_MEM_REF)
5564 return;
5566 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5567 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5571 /* Function vect_create_destination_var.
5573 Create a new temporary of type VECTYPE. */
5575 tree
5576 vect_create_destination_var (tree scalar_dest, tree vectype)
5578 tree vec_dest;
5579 const char *name;
5580 char *new_name;
5581 tree type;
5582 enum vect_var_kind kind;
5584 kind = vectype
5585 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5586 ? vect_mask_var
5587 : vect_simple_var
5588 : vect_scalar_var;
5589 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5591 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5593 name = get_name (scalar_dest);
5594 if (name)
5595 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5596 else
5597 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5598 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5599 free (new_name);
5601 return vec_dest;
5604 /* Function vect_grouped_store_supported.
5606 Returns TRUE if interleave high and interleave low permutations
5607 are supported, and FALSE otherwise. */
5609 bool
5610 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5612 machine_mode mode = TYPE_MODE (vectype);
5614 /* vect_permute_store_chain requires the group size to be equal to 3 or
5615 be a power of two. */
5616 if (count != 3 && exact_log2 (count) == -1)
5618 if (dump_enabled_p ())
5619 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5620 "the size of the group of accesses"
5621 " is not a power of 2 or not eqaul to 3\n");
5622 return false;
5625 /* Check that the permutation is supported. */
5626 if (VECTOR_MODE_P (mode))
5628 unsigned int i;
5629 if (count == 3)
5631 unsigned int j0 = 0, j1 = 0, j2 = 0;
5632 unsigned int i, j;
5634 unsigned int nelt;
5635 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5637 if (dump_enabled_p ())
5638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5639 "cannot handle groups of 3 stores for"
5640 " variable-length vectors\n");
5641 return false;
5644 vec_perm_builder sel (nelt, nelt, 1);
5645 sel.quick_grow (nelt);
5646 vec_perm_indices indices;
5647 for (j = 0; j < 3; j++)
5649 int nelt0 = ((3 - j) * nelt) % 3;
5650 int nelt1 = ((3 - j) * nelt + 1) % 3;
5651 int nelt2 = ((3 - j) * nelt + 2) % 3;
5652 for (i = 0; i < nelt; i++)
5654 if (3 * i + nelt0 < nelt)
5655 sel[3 * i + nelt0] = j0++;
5656 if (3 * i + nelt1 < nelt)
5657 sel[3 * i + nelt1] = nelt + j1++;
5658 if (3 * i + nelt2 < nelt)
5659 sel[3 * i + nelt2] = 0;
5661 indices.new_vector (sel, 2, nelt);
5662 if (!can_vec_perm_const_p (mode, mode, indices))
5664 if (dump_enabled_p ())
5665 dump_printf (MSG_MISSED_OPTIMIZATION,
5666 "permutation op not supported by target.\n");
5667 return false;
5670 for (i = 0; i < nelt; i++)
5672 if (3 * i + nelt0 < nelt)
5673 sel[3 * i + nelt0] = 3 * i + nelt0;
5674 if (3 * i + nelt1 < nelt)
5675 sel[3 * i + nelt1] = 3 * i + nelt1;
5676 if (3 * i + nelt2 < nelt)
5677 sel[3 * i + nelt2] = nelt + j2++;
5679 indices.new_vector (sel, 2, nelt);
5680 if (!can_vec_perm_const_p (mode, mode, indices))
5682 if (dump_enabled_p ())
5683 dump_printf (MSG_MISSED_OPTIMIZATION,
5684 "permutation op not supported by target.\n");
5685 return false;
5688 return true;
5690 else
5692 /* If length is not equal to 3 then only a power of 2 is supported. */
5693 gcc_assert (pow2p_hwi (count));
5694 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5696 /* The encoding has 2 interleaved stepped patterns. */
5697 if (!multiple_p (nelt, 2))
5698 return false;
5699 vec_perm_builder sel (nelt, 2, 3);
5700 sel.quick_grow (6);
5701 for (i = 0; i < 3; i++)
5703 sel[i * 2] = i;
5704 sel[i * 2 + 1] = i + nelt;
5706 vec_perm_indices indices (sel, 2, nelt);
5707 if (can_vec_perm_const_p (mode, mode, indices))
5709 for (i = 0; i < 6; i++)
5710 sel[i] += exact_div (nelt, 2);
5711 indices.new_vector (sel, 2, nelt);
5712 if (can_vec_perm_const_p (mode, mode, indices))
5713 return true;
5718 if (dump_enabled_p ())
5719 dump_printf (MSG_MISSED_OPTIMIZATION,
5720 "permutation op not supported by target.\n");
5721 return false;
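/* Worked example for the power-of-2 path above: for V8HI (nelt == 8) the
   two permutations that are checked expand to the index vectors

     { 0, 8, 1, 9, 2, 10, 3, 11 }  and  { 4, 12, 5, 13, 6, 14, 7, 15 },

   i.e. the interleave masks (perm_mask_high / perm_mask_low) that
   vect_permute_store_chain generates below.  */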
5724 /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
5725 of type VECTYPE. MASKED_P says whether the masked form is needed. */
5727 internal_fn
5728 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5729 bool masked_p)
5731 if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5732 vec_mask_len_store_lanes_optab, vectype,
5733 count))
5734 return IFN_MASK_LEN_STORE_LANES;
5735 else if (masked_p)
5737 if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5738 vec_mask_store_lanes_optab, vectype,
5739 count))
5740 return IFN_MASK_STORE_LANES;
5742 else
5744 if (vect_lanes_optab_supported_p ("vec_store_lanes",
5745 vec_store_lanes_optab, vectype, count))
5746 return IFN_STORE_LANES;
5748 return IFN_LAST;
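/* Illustrative use (a sketch of how callers consume the result; the
   variable names are made up): the caller checks the group once and
   branches on the returned internal function, e.g.

     internal_fn lanes_ifn
       = vect_store_lanes_supported (vectype, group_size, masked_p);
     if (lanes_ifn == IFN_LAST)
       ... fall back to interleaving via vect_permute_store_chain ...

   This function only queries the optabs; it emits no code itself.  */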
5752 /* Function vect_permute_store_chain.
5754 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5755 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5756 the data correctly for the stores. Return the final references for stores
5757 in RESULT_CHAIN.
5759 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5760 The input is 4 vectors each containing 8 elements. We assign a number to
5761 each element, the input sequence is:
5763 1st vec: 0 1 2 3 4 5 6 7
5764 2nd vec: 8 9 10 11 12 13 14 15
5765 3rd vec: 16 17 18 19 20 21 22 23
5766 4th vec: 24 25 26 27 28 29 30 31
5768 The output sequence should be:
5770 1st vec: 0 8 16 24 1 9 17 25
5771 2nd vec: 2 10 18 26 3 11 19 27
5772 3rd vec: 4 12 20 28 5 13 21 29
5773 4th vec: 6 14 22 30 7 15 23 31
5775 i.e., we interleave the contents of the four vectors in their order.
5777 We use interleave_high/low instructions to create such output. The input of
5778 each interleave_high/low operation is two vectors:
5779 1st vec 2nd vec
5780 0 1 2 3 4 5 6 7
5781 the even elements of the result vector are obtained left-to-right from the
5782 high/low elements of the first vector. The odd elements of the result are
5783 obtained left-to-right from the high/low elements of the second vector.
5784 The output of interleave_high will be: 0 4 1 5
5785 and of interleave_low: 2 6 3 7
5788 The permutation is done in log LENGTH stages. In each stage interleave_high
5789 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5790 where the first argument is taken from the first half of DR_CHAIN and the
5791 second argument from its second half.
5792 In our example,
5794 I1: interleave_high (1st vec, 3rd vec)
5795 I2: interleave_low (1st vec, 3rd vec)
5796 I3: interleave_high (2nd vec, 4th vec)
5797 I4: interleave_low (2nd vec, 4th vec)
5799 The output for the first stage is:
5801 I1: 0 16 1 17 2 18 3 19
5802 I2: 4 20 5 21 6 22 7 23
5803 I3: 8 24 9 25 10 26 11 27
5804 I4: 12 28 13 29 14 30 15 31
5806 The output of the second stage, i.e. the final result is:
5808 I1: 0 8 16 24 1 9 17 25
5809 I2: 2 10 18 26 3 11 19 27
5810 I3: 4 12 20 28 5 13 21 29
5811 I4: 6 14 22 30 7 15 23 31. */
5813 void
5814 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5815 unsigned int length,
5816 stmt_vec_info stmt_info,
5817 gimple_stmt_iterator *gsi,
5818 vec<tree> *result_chain)
5820 tree vect1, vect2, high, low;
5821 gimple *perm_stmt;
5822 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5823 tree perm_mask_low, perm_mask_high;
5824 tree data_ref;
5825 tree perm3_mask_low, perm3_mask_high;
5826 unsigned int i, j, n, log_length = exact_log2 (length);
5828 result_chain->quick_grow (length);
5829 memcpy (result_chain->address (), dr_chain.address (),
5830 length * sizeof (tree));
5832 if (length == 3)
5834 /* vect_grouped_store_supported ensures that this is constant. */
5835 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5836 unsigned int j0 = 0, j1 = 0, j2 = 0;
5838 vec_perm_builder sel (nelt, nelt, 1);
5839 sel.quick_grow (nelt);
5840 vec_perm_indices indices;
5841 for (j = 0; j < 3; j++)
5843 int nelt0 = ((3 - j) * nelt) % 3;
5844 int nelt1 = ((3 - j) * nelt + 1) % 3;
5845 int nelt2 = ((3 - j) * nelt + 2) % 3;
5847 for (i = 0; i < nelt; i++)
5849 if (3 * i + nelt0 < nelt)
5850 sel[3 * i + nelt0] = j0++;
5851 if (3 * i + nelt1 < nelt)
5852 sel[3 * i + nelt1] = nelt + j1++;
5853 if (3 * i + nelt2 < nelt)
5854 sel[3 * i + nelt2] = 0;
5856 indices.new_vector (sel, 2, nelt);
5857 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5859 for (i = 0; i < nelt; i++)
5861 if (3 * i + nelt0 < nelt)
5862 sel[3 * i + nelt0] = 3 * i + nelt0;
5863 if (3 * i + nelt1 < nelt)
5864 sel[3 * i + nelt1] = 3 * i + nelt1;
5865 if (3 * i + nelt2 < nelt)
5866 sel[3 * i + nelt2] = nelt + j2++;
5868 indices.new_vector (sel, 2, nelt);
5869 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5871 vect1 = dr_chain[0];
5872 vect2 = dr_chain[1];
5874 /* Create interleaving stmt:
5875 low = VEC_PERM_EXPR <vect1, vect2,
5876 {j, nelt, *, j + 1, nelt + j + 1, *,
5877 j + 2, nelt + j + 2, *, ...}> */
5878 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5879 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5880 vect2, perm3_mask_low);
5881 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5883 vect1 = data_ref;
5884 vect2 = dr_chain[2];
5885 /* Create interleaving stmt:
5886 low = VEC_PERM_EXPR <vect1, vect2,
5887 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5888 6, 7, nelt + j + 2, ...}> */
5889 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5890 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5891 vect2, perm3_mask_high);
5892 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5893 (*result_chain)[j] = data_ref;
5896 else
5898 /* If length is not equal to 3 then only a power of 2 is supported. */
5899 gcc_assert (pow2p_hwi (length));
5901 /* The encoding has 2 interleaved stepped patterns. */
5902 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5903 vec_perm_builder sel (nelt, 2, 3);
5904 sel.quick_grow (6);
5905 for (i = 0; i < 3; i++)
5907 sel[i * 2] = i;
5908 sel[i * 2 + 1] = i + nelt;
5910 vec_perm_indices indices (sel, 2, nelt);
5911 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5913 for (i = 0; i < 6; i++)
5914 sel[i] += exact_div (nelt, 2);
5915 indices.new_vector (sel, 2, nelt);
5916 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5918 for (i = 0, n = log_length; i < n; i++)
5920 for (j = 0; j < length/2; j++)
5922 vect1 = dr_chain[j];
5923 vect2 = dr_chain[j+length/2];
5925 /* Create interleaving stmt:
5926 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5927 ...}> */
5928 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5929 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5930 vect2, perm_mask_high);
5931 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5932 (*result_chain)[2*j] = high;
5934 /* Create interleaving stmt:
5935 low = VEC_PERM_EXPR <vect1, vect2,
5936 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5937 ...}> */
5938 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5939 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5940 vect2, perm_mask_low);
5941 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5942 (*result_chain)[2*j+1] = low;
5944 memcpy (dr_chain.address (), result_chain->address (),
5945 length * sizeof (tree));
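/* Illustrative sketch (SSA names made up): for LENGTH == 2 and V8HI
   vectors the loop above runs a single stage and emits

     vect_inter_high.1 = VEC_PERM_EXPR <v0, v1, { 0, 8, 1, 9, 2, 10, 3, 11 }>;
     vect_inter_low.2  = VEC_PERM_EXPR <v0, v1, { 4, 12, 5, 13, 6, 14, 7, 15 }>;

   so that RESULT_CHAIN holds the fully interleaved element sequence
   0 8 1 9 2 10 3 11 | 4 12 5 13 6 14 7 15.  */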
5950 /* Function vect_setup_realignment
5952 This function is called when vectorizing an unaligned load using
5953 the dr_explicit_realign[_optimized] scheme.
5954 This function generates the following code at the loop prolog:
5956 p = initial_addr;
5957 x msq_init = *(floor(p)); # prolog load
5958 realignment_token = call target_builtin;
5959 loop:
5960 x msq = phi (msq_init, ---)
5962 The stmts marked with x are generated only for the case of
5963 dr_explicit_realign_optimized.
5965 The code above sets up a new (vector) pointer, pointing to the first
5966 location accessed by STMT_INFO, and a "floor-aligned" load using that
5967 pointer. It also generates code to compute the "realignment-token"
5968 (if the relevant target hook was defined), and creates a phi-node at the
5969 loop-header bb whose arguments are the result of the prolog-load (created
5970 by this function) and the result of a load that takes place in the loop
5971 (to be created by the caller to this function).
5973 For the case of dr_explicit_realign_optimized:
5974 The caller to this function uses the phi-result (msq) to create the
5975 realignment code inside the loop, and sets up the missing phi argument,
5976 as follows:
5977 loop:
5978 msq = phi (msq_init, lsq)
5979 lsq = *(floor(p')); # load in loop
5980 result = realign_load (msq, lsq, realignment_token);
5982 For the case of dr_explicit_realign:
5983 loop:
5984 msq = *(floor(p)); # load in loop
5985 p' = p + (VS-1);
5986 lsq = *(floor(p')); # load in loop
5987 result = realign_load (msq, lsq, realignment_token);
5989 Input:
5990 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5991 a memory location that may be unaligned.
5992 BSI - place where new code is to be inserted.
5993 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5994 is used.
5996 Output:
5997 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5998 target hook, if defined.
5999 Return value - the result of the loop-header phi node. */
6001 tree
6002 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
6003 gimple_stmt_iterator *gsi, tree *realignment_token,
6004 enum dr_alignment_support alignment_support_scheme,
6005 tree init_addr,
6006 class loop **at_loop)
6008 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6009 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6010 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
6011 struct data_reference *dr = dr_info->dr;
6012 class loop *loop = NULL;
6013 edge pe = NULL;
6014 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
6015 tree vec_dest;
6016 gimple *inc;
6017 tree ptr;
6018 tree data_ref;
6019 basic_block new_bb;
6020 tree msq_init = NULL_TREE;
6021 tree new_temp;
6022 gphi *phi_stmt;
6023 tree msq = NULL_TREE;
6024 gimple_seq stmts = NULL;
6025 bool compute_in_loop = false;
6026 bool nested_in_vect_loop = false;
6027 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
6028 class loop *loop_for_initial_load = NULL;
6030 if (loop_vinfo)
6032 loop = LOOP_VINFO_LOOP (loop_vinfo);
6033 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
6036 gcc_assert (alignment_support_scheme == dr_explicit_realign
6037 || alignment_support_scheme == dr_explicit_realign_optimized);
6039 /* We need to generate three things:
6040 1. the misalignment computation
6041 2. the extra vector load (for the optimized realignment scheme).
6042 3. the phi node for the two vectors from which the realignment is
6043 done (for the optimized realignment scheme). */
6045 /* 1. Determine where to generate the misalignment computation.
6047 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
6048 calculation will be generated by this function, outside the loop (in the
6049 preheader). Otherwise, INIT_ADDR had already been computed for us by the
6050 caller, inside the loop.
6052 Background: If the misalignment remains fixed throughout the iterations of
6053 the loop, then both realignment schemes are applicable, and also the
6054 misalignment computation can be done outside LOOP. This is because we are
6055 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
6056 are a multiple of VS (the Vector Size), and therefore the misalignment in
6057 different vectorized LOOP iterations is always the same.
6058 The problem arises only if the memory access is in an inner-loop nested
6059 inside LOOP, which is now being vectorized using outer-loop vectorization.
6060 This is the only case when the misalignment of the memory access may not
6061 remain fixed throughout the iterations of the inner-loop (as explained in
6062 detail in vect_supportable_dr_alignment). In this case, not only is the
6063 optimized realignment scheme not applicable, but also the misalignment
6064 computation (and generation of the realignment token that is passed to
6065 REALIGN_LOAD) have to be done inside the loop.
6067 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
6068 or not, which in turn determines if the misalignment is computed inside
6069 the inner-loop, or outside LOOP. */
6071 if (init_addr != NULL_TREE || !loop_vinfo)
6073 compute_in_loop = true;
6074 gcc_assert (alignment_support_scheme == dr_explicit_realign);
6078 /* 2. Determine where to generate the extra vector load.
6080 For the optimized realignment scheme, instead of generating two vector
6081 loads in each iteration, we generate a single extra vector load in the
6082 preheader of the loop, and in each iteration reuse the result of the
6083 vector load from the previous iteration. In case the memory access is in
6084 an inner-loop nested inside LOOP, which is now being vectorized using
6085 outer-loop vectorization, we need to determine whether this initial vector
6086 load should be generated at the preheader of the inner-loop, or can be
6087 generated at the preheader of LOOP. If the memory access has no evolution
6088 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
6089 to be generated inside LOOP (in the preheader of the inner-loop). */
6091 if (nested_in_vect_loop)
6093 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
6094 bool invariant_in_outerloop =
6095 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
6096 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
6098 else
6099 loop_for_initial_load = loop;
6100 if (at_loop)
6101 *at_loop = loop_for_initial_load;
6103 tree vuse = NULL_TREE;
6104 if (loop_for_initial_load)
6106 pe = loop_preheader_edge (loop_for_initial_load);
6107 if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
6108 vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
6110 if (!vuse)
6111 vuse = gimple_vuse (gsi_stmt (*gsi));
6113 /* 3. For the case of the optimized realignment, create the first vector
6114 load at the loop preheader. */
6116 if (alignment_support_scheme == dr_explicit_realign_optimized)
6118 /* Create msq_init = *(floor(p1)) in the loop preheader */
6119 gassign *new_stmt;
6121 gcc_assert (!compute_in_loop);
6122 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6123 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
6124 loop_for_initial_load, NULL_TREE,
6125 &init_addr, NULL, &inc, true);
6126 if (TREE_CODE (ptr) == SSA_NAME)
6127 new_temp = copy_ssa_name (ptr);
6128 else
6129 new_temp = make_ssa_name (TREE_TYPE (ptr));
6130 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
6131 tree type = TREE_TYPE (ptr);
6132 new_stmt = gimple_build_assign
6133 (new_temp, BIT_AND_EXPR, ptr,
6134 fold_build2 (MINUS_EXPR, type,
6135 build_int_cst (type, 0),
6136 build_int_cst (type, align)));
6137 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6138 gcc_assert (!new_bb);
6139 data_ref
6140 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
6141 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
6142 vect_copy_ref_info (data_ref, DR_REF (dr));
6143 new_stmt = gimple_build_assign (vec_dest, data_ref);
6144 new_temp = make_ssa_name (vec_dest, new_stmt);
6145 gimple_assign_set_lhs (new_stmt, new_temp);
6146 gimple_set_vuse (new_stmt, vuse);
6147 if (pe)
6149 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6150 gcc_assert (!new_bb);
6152 else
6153 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6155 msq_init = gimple_assign_lhs (new_stmt);
6158 /* 4. Create realignment token using a target builtin, if available.
6159 It is done either inside the containing loop, or before LOOP (as
6160 determined above). */
6162 if (targetm.vectorize.builtin_mask_for_load)
6164 gcall *new_stmt;
6165 tree builtin_decl;
6167 /* Compute INIT_ADDR - the initial address accessed by this memref. */
6168 if (!init_addr)
6170 /* Generate the INIT_ADDR computation outside LOOP. */
6171 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
6172 stmt_info, &stmts,
6173 NULL_TREE);
6174 if (loop)
6176 pe = loop_preheader_edge (loop);
6177 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6178 gcc_assert (!new_bb);
6180 else
6181 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
6184 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
6185 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
6186 vec_dest =
6187 vect_create_destination_var (scalar_dest,
6188 gimple_call_return_type (new_stmt));
6189 new_temp = make_ssa_name (vec_dest, new_stmt);
6190 gimple_call_set_lhs (new_stmt, new_temp);
6192 if (compute_in_loop)
6193 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6194 else
6196 /* Generate the misalignment computation outside LOOP. */
6197 pe = loop_preheader_edge (loop);
6198 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6199 gcc_assert (!new_bb);
6202 *realignment_token = gimple_call_lhs (new_stmt);
6204 /* The result of the CALL_EXPR to this builtin is determined from
6205 the value of the parameter and no global variables are touched
6206 which makes the builtin a "const" function. Requiring the
6207 builtin to have the "const" attribute makes it unnecessary
6208 to call mark_call_clobbered. */
6209 gcc_assert (TREE_READONLY (builtin_decl));
6212 if (alignment_support_scheme == dr_explicit_realign)
6213 return msq;
6215 gcc_assert (!compute_in_loop);
6216 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
6219 /* 5. Create msq = phi <msq_init, lsq> in loop */
6221 pe = loop_preheader_edge (containing_loop);
6222 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6223 msq = make_ssa_name (vec_dest);
6224 phi_stmt = create_phi_node (msq, containing_loop->header);
6225 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
6227 return msq;
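/* Illustrative sketch of the code shape produced for
   dr_explicit_realign_optimized (names made up; the mask-for-load builtin
   is target-specific):

     # loop preheader
     vectp.5_1 = <initial address>;
     _2 = vectp.5_1 & -align;
     msq_init_3 = *_2;
     realign_tok_4 = <target mask_for_load builtin> (init_addr);

     # loop header
     msq_5 = PHI <msq_init_3 (preheader), lsq_6 (latch)>;

   The in-loop load lsq_6 and the REALIGN_LOAD that combines msq/lsq are
   created by the caller, as described in the comment at the top.  */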
6231 /* Function vect_grouped_load_supported.
6233 COUNT is the size of the load group (the number of statements plus the
6234 number of gaps). SINGLE_ELEMENT_P is true if there is actually
6235 only one statement, with a gap of COUNT - 1.
6237 Returns true if a suitable permute exists. */
6239 bool
6240 vect_grouped_load_supported (tree vectype, bool single_element_p,
6241 unsigned HOST_WIDE_INT count)
6243 machine_mode mode = TYPE_MODE (vectype);
6245 /* If this is single-element interleaving with an element distance
6246 that leaves unused vector loads around, punt - we would at least create
6247 very sub-optimal code in that case (and blow up memory,
6248 see PR65518). */
6249 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
6251 if (dump_enabled_p ())
6252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6253 "single-element interleaving not supported "
6254 "for not adjacent vector loads\n");
6255 return false;
6258 /* vect_permute_load_chain requires the group size to be equal to 3 or
6259 be a power of two. */
6260 if (count != 3 && exact_log2 (count) == -1)
6262 if (dump_enabled_p ())
6263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6264 "the size of the group of accesses"
6265 " is not a power of 2 or not equal to 3\n");
6266 return false;
6269 /* Check that the permutation is supported. */
6270 if (VECTOR_MODE_P (mode))
6272 unsigned int i, j;
6273 if (count == 3)
6275 unsigned int nelt;
6276 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
6278 if (dump_enabled_p ())
6279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6280 "cannot handle groups of 3 loads for"
6281 " variable-length vectors\n");
6282 return false;
6285 vec_perm_builder sel (nelt, nelt, 1);
6286 sel.quick_grow (nelt);
6287 vec_perm_indices indices;
6288 unsigned int k;
6289 for (k = 0; k < 3; k++)
6291 for (i = 0; i < nelt; i++)
6292 if (3 * i + k < 2 * nelt)
6293 sel[i] = 3 * i + k;
6294 else
6295 sel[i] = 0;
6296 indices.new_vector (sel, 2, nelt);
6297 if (!can_vec_perm_const_p (mode, mode, indices))
6299 if (dump_enabled_p ())
6300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6301 "shuffle of 3 loads is not supported by"
6302 " target\n");
6303 return false;
6305 for (i = 0, j = 0; i < nelt; i++)
6306 if (3 * i + k < 2 * nelt)
6307 sel[i] = i;
6308 else
6309 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6310 indices.new_vector (sel, 2, nelt);
6311 if (!can_vec_perm_const_p (mode, mode, indices))
6313 if (dump_enabled_p ())
6314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6315 "shuffle of 3 loads is not supported by"
6316 " target\n");
6317 return false;
6320 return true;
6322 else
6324 /* If length is not equal to 3 then only a power of 2 is supported. */
6325 gcc_assert (pow2p_hwi (count));
6326 poly_uint64 nelt = GET_MODE_NUNITS (mode);
6328 /* The encoding has a single stepped pattern. */
6329 vec_perm_builder sel (nelt, 1, 3);
6330 sel.quick_grow (3);
6331 for (i = 0; i < 3; i++)
6332 sel[i] = i * 2;
6333 vec_perm_indices indices (sel, 2, nelt);
6334 if (can_vec_perm_const_p (mode, mode, indices))
6336 for (i = 0; i < 3; i++)
6337 sel[i] = i * 2 + 1;
6338 indices.new_vector (sel, 2, nelt);
6339 if (can_vec_perm_const_p (mode, mode, indices))
6340 return true;
6345 if (dump_enabled_p ())
6346 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6347 "extract even/odd not supported by target\n");
6348 return false;
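/* Worked example for the power-of-2 path above: for V4SI (nelt == 4) the
   two permutations that are checked are the extract-even and extract-odd
   index vectors

     { 0, 2, 4, 6 }  and  { 1, 3, 5, 7 },

   matching the masks generated by vect_permute_load_chain below.  */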
6351 /* Return FN if vec_{mask_,mask_len_}load_lanes is available for COUNT vectors
6352 of type VECTYPE. MASKED_P says whether the masked form is needed. */
6354 internal_fn
6355 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6356 bool masked_p)
6358 if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6359 vec_mask_len_load_lanes_optab, vectype,
6360 count))
6361 return IFN_MASK_LEN_LOAD_LANES;
6362 else if (masked_p)
6364 if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6365 vec_mask_load_lanes_optab, vectype,
6366 count))
6367 return IFN_MASK_LOAD_LANES;
6369 else
6371 if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
6372 vectype, count))
6373 return IFN_LOAD_LANES;
6375 return IFN_LAST;
6378 /* Function vect_permute_load_chain.
6380 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6381 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6382 the input data correctly. Return the final references for loads in
6383 RESULT_CHAIN.
6385 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6386 The input is 4 vectors each containing 8 elements. We assign a number to each
6387 element, the input sequence is:
6389 1st vec: 0 1 2 3 4 5 6 7
6390 2nd vec: 8 9 10 11 12 13 14 15
6391 3rd vec: 16 17 18 19 20 21 22 23
6392 4th vec: 24 25 26 27 28 29 30 31
6394 The output sequence should be:
6396 1st vec: 0 4 8 12 16 20 24 28
6397 2nd vec: 1 5 9 13 17 21 25 29
6398 3rd vec: 2 6 10 14 18 22 26 30
6399 4th vec: 3 7 11 15 19 23 27 31
6401 i.e., the first output vector should contain the first elements of each
6402 interleaving group, etc.
6404 We use extract_even/odd instructions to create such output. The input of
6405 each extract_even/odd operation is two vectors
6406 1st vec 2nd vec
6407 0 1 2 3 4 5 6 7
6409 and the output is the vector of extracted even/odd elements. The output of
6410 extract_even will be: 0 2 4 6
6411 and of extract_odd: 1 3 5 7
6414 The permutation is done in log LENGTH stages. In each stage extract_even
6415 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6416 their order. In our example,
6418 E1: extract_even (1st vec, 2nd vec)
6419 E2: extract_odd (1st vec, 2nd vec)
6420 E3: extract_even (3rd vec, 4th vec)
6421 E4: extract_odd (3rd vec, 4th vec)
6423 The output for the first stage will be:
6425 E1: 0 2 4 6 8 10 12 14
6426 E2: 1 3 5 7 9 11 13 15
6427 E3: 16 18 20 22 24 26 28 30
6428 E4: 17 19 21 23 25 27 29 31
6430 In order to proceed and create the correct sequence for the next stage (or
6431 for the correct output, if the second stage is the last one, as in our
6432 example), we first put the output of extract_even operation and then the
6433 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6434 The input for the second stage is:
6436 1st vec (E1): 0 2 4 6 8 10 12 14
6437 2nd vec (E3): 16 18 20 22 24 26 28 30
6438 3rd vec (E2): 1 3 5 7 9 11 13 15
6439 4th vec (E4): 17 19 21 23 25 27 29 31
6441 The output of the second stage:
6443 E1: 0 4 8 12 16 20 24 28
6444 E2: 2 6 10 14 18 22 26 30
6445 E3: 1 5 9 13 17 21 25 29
6446 E4: 3 7 11 15 19 23 27 31
6448 And RESULT_CHAIN after reordering:
6450 1st vec (E1): 0 4 8 12 16 20 24 28
6451 2nd vec (E3): 1 5 9 13 17 21 25 29
6452 3rd vec (E2): 2 6 10 14 18 22 26 30
6453 4th vec (E4): 3 7 11 15 19 23 27 31. */
6455 static void
6456 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6457 unsigned int length,
6458 stmt_vec_info stmt_info,
6459 gimple_stmt_iterator *gsi,
6460 vec<tree> *result_chain)
6462 tree data_ref, first_vect, second_vect;
6463 tree perm_mask_even, perm_mask_odd;
6464 tree perm3_mask_low, perm3_mask_high;
6465 gimple *perm_stmt;
6466 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6467 unsigned int i, j, log_length = exact_log2 (length);
6469 result_chain->quick_grow (length);
6470 memcpy (result_chain->address (), dr_chain.address (),
6471 length * sizeof (tree));
6473 if (length == 3)
6475 /* vect_grouped_load_supported ensures that this is constant. */
6476 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6477 unsigned int k;
6479 vec_perm_builder sel (nelt, nelt, 1);
6480 sel.quick_grow (nelt);
6481 vec_perm_indices indices;
6482 for (k = 0; k < 3; k++)
6484 for (i = 0; i < nelt; i++)
6485 if (3 * i + k < 2 * nelt)
6486 sel[i] = 3 * i + k;
6487 else
6488 sel[i] = 0;
6489 indices.new_vector (sel, 2, nelt);
6490 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6492 for (i = 0, j = 0; i < nelt; i++)
6493 if (3 * i + k < 2 * nelt)
6494 sel[i] = i;
6495 else
6496 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6497 indices.new_vector (sel, 2, nelt);
6498 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6500 first_vect = dr_chain[0];
6501 second_vect = dr_chain[1];
6503 /* Create interleaving stmt (low part of):
6504 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6505 ...}> */
6506 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6507 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6508 second_vect, perm3_mask_low);
6509 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6511 /* Create interleaving stmt (high part of):
6512 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6513 ...}> */
6514 first_vect = data_ref;
6515 second_vect = dr_chain[2];
6516 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6517 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6518 second_vect, perm3_mask_high);
6519 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6520 (*result_chain)[k] = data_ref;
6523 else
6525 /* If length is not equal to 3 then only a power of 2 is supported. */
6526 gcc_assert (pow2p_hwi (length));
6528 /* The encoding has a single stepped pattern. */
6529 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6530 vec_perm_builder sel (nelt, 1, 3);
6531 sel.quick_grow (3);
6532 for (i = 0; i < 3; ++i)
6533 sel[i] = i * 2;
6534 vec_perm_indices indices (sel, 2, nelt);
6535 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6537 for (i = 0; i < 3; ++i)
6538 sel[i] = i * 2 + 1;
6539 indices.new_vector (sel, 2, nelt);
6540 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6542 for (i = 0; i < log_length; i++)
6544 for (j = 0; j < length; j += 2)
6546 first_vect = dr_chain[j];
6547 second_vect = dr_chain[j+1];
6549 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6550 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6551 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6552 first_vect, second_vect,
6553 perm_mask_even);
6554 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6555 (*result_chain)[j/2] = data_ref;
6557 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6558 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6559 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6560 first_vect, second_vect,
6561 perm_mask_odd);
6562 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6563 (*result_chain)[j/2+length/2] = data_ref;
6565 memcpy (dr_chain.address (), result_chain->address (),
6566 length * sizeof (tree));
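/* Illustrative sketch (SSA names made up): for LENGTH == 2 and V8HI
   vectors a single stage is emitted,

     vect_perm_even.1 = VEC_PERM_EXPR <v0, v1, { 0, 2, 4, 6, 8, 10, 12, 14 }>;
     vect_perm_odd.2  = VEC_PERM_EXPR <v0, v1, { 1, 3, 5, 7, 9, 11, 13, 15 }>;

   which de-interleaves the loaded sequence 0 1 2 ... 15 into the even
   elements 0 2 4 ... 14 and the odd elements 1 3 5 ... 15.  */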
6571 /* Function vect_shift_permute_load_chain.
6573 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6574 sequence of stmts to reorder the input data accordingly.
6575 Return the final references for loads in RESULT_CHAIN.
6576 Return true if successful, false otherwise.
6578 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6579 The input is 3 vectors each containing 8 elements. We assign a
6580 number to each element, the input sequence is:
6582 1st vec: 0 1 2 3 4 5 6 7
6583 2nd vec: 8 9 10 11 12 13 14 15
6584 3rd vec: 16 17 18 19 20 21 22 23
6586 The output sequence should be:
6588 1st vec: 0 3 6 9 12 15 18 21
6589 2nd vec: 1 4 7 10 13 16 19 22
6590 3rd vec: 2 5 8 11 14 17 20 23
6592 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6594 First we shuffle all 3 vectors to get correct elements order:
6596 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6597 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6598 3rd vec: (16 19 22) (17 20 23) (18 21)
6600 Next we unite and shift vector 3 times:
6602 1st step:
6603 shift right by 6 the concatenation of:
6604 "1st vec" and "2nd vec"
6605 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6606 "2nd vec" and "3rd vec"
6607 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6608 "3rd vec" and "1st vec"
6609 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6610 | New vectors |
6612 So that now new vectors are:
6614 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6615 2nd vec: (10 13) (16 19 22) (17 20 23)
6616 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6618 2nd step:
6619 shift right by 5 the concatenation of:
6620 "1st vec" and "3rd vec"
6621 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6622 "2nd vec" and "1st vec"
6623 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6624 "3rd vec" and "2nd vec"
6625 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6626 | New vectors |
6628 So that now new vectors are:
6630 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6631 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6632 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6634 3rd step:
6635 shift right by 5 the concatenation of:
6636 "1st vec" and "1st vec"
6637 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6638 shift right by 3 the concatenation of:
6639 "2nd vec" and "2nd vec"
6640 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6641 | New vectors |
6643 So that now all vectors are READY:
6644 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6645 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6646 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6648 This algorithm is faster than the one in vect_permute_load_chain if:
6649 1. "shift of a concatenation" is faster than general permutation.
6650 This is usually so.
6651 2. The TARGET machine can't execute vector instructions in parallel.
6652 This is because each step of the algorithm depends on the previous one.
6653 The algorithm in vect_permute_load_chain is much more parallel.
6655 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6658 static bool
6659 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6660 unsigned int length,
6661 stmt_vec_info stmt_info,
6662 gimple_stmt_iterator *gsi,
6663 vec<tree> *result_chain)
6665 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6666 tree perm2_mask1, perm2_mask2, perm3_mask;
6667 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6668 gimple *perm_stmt;
6670 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6671 machine_mode vmode = TYPE_MODE (vectype);
6672 unsigned int i;
6673 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6675 unsigned HOST_WIDE_INT nelt, vf;
6676 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6677 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6678 /* Not supported for variable-length vectors. */
6679 return false;
6681 vec_perm_builder sel (nelt, nelt, 1);
6682 sel.quick_grow (nelt);
6684 result_chain->quick_grow (length);
6685 memcpy (result_chain->address (), dr_chain.address (),
6686 length * sizeof (tree));
6688 if (pow2p_hwi (length) && vf > 4)
6690 unsigned int j, log_length = exact_log2 (length);
6691 for (i = 0; i < nelt / 2; ++i)
6692 sel[i] = i * 2;
6693 for (i = 0; i < nelt / 2; ++i)
6694 sel[nelt / 2 + i] = i * 2 + 1;
6695 vec_perm_indices indices (sel, 2, nelt);
6696 if (!can_vec_perm_const_p (vmode, vmode, indices))
6698 if (dump_enabled_p ())
6699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6700 "shuffle of 2 fields structure is not \
6701 supported by target\n");
6702 return false;
6704 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6706 for (i = 0; i < nelt / 2; ++i)
6707 sel[i] = i * 2 + 1;
6708 for (i = 0; i < nelt / 2; ++i)
6709 sel[nelt / 2 + i] = i * 2;
6710 indices.new_vector (sel, 2, nelt);
6711 if (!can_vec_perm_const_p (vmode, vmode, indices))
6713 if (dump_enabled_p ())
6714 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6715 "shuffle of 2 fields structure is not \
6716 supported by target\n");
6717 return false;
6719 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6721 /* Generating permutation constant to shift all elements.
6722 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6723 for (i = 0; i < nelt; i++)
6724 sel[i] = nelt / 2 + i;
6725 indices.new_vector (sel, 2, nelt);
6726 if (!can_vec_perm_const_p (vmode, vmode, indices))
6728 if (dump_enabled_p ())
6729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6730 "shift permutation is not supported by target\n");
6731 return false;
6733 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6735 /* Generating permutation constant to select vector from 2.
6736 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6737 for (i = 0; i < nelt / 2; i++)
6738 sel[i] = i;
6739 for (i = nelt / 2; i < nelt; i++)
6740 sel[i] = nelt + i;
6741 indices.new_vector (sel, 2, nelt);
6742 if (!can_vec_perm_const_p (vmode, vmode, indices))
6744 if (dump_enabled_p ())
6745 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6746 "select is not supported by target\n");
6747 return false;
6749 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6751 for (i = 0; i < log_length; i++)
6753 for (j = 0; j < length; j += 2)
6755 first_vect = dr_chain[j];
6756 second_vect = dr_chain[j + 1];
6758 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6759 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6760 first_vect, first_vect,
6761 perm2_mask1);
6762 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6763 vect[0] = data_ref;
6765 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6766 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6767 second_vect, second_vect,
6768 perm2_mask2);
6769 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6770 vect[1] = data_ref;
6772 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6773 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6774 vect[0], vect[1], shift1_mask);
6775 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6776 (*result_chain)[j/2 + length/2] = data_ref;
6778 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6779 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6780 vect[0], vect[1], select_mask);
6781 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6782 (*result_chain)[j/2] = data_ref;
6784 memcpy (dr_chain.address (), result_chain->address (),
6785 length * sizeof (tree));
6787 return true;
6789 if (length == 3 && vf > 2)
6791 unsigned int k = 0, l = 0;
6793 /* Generating permutation constant to get all elements in right order.
6794 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6795 for (i = 0; i < nelt; i++)
6797 if (3 * k + (l % 3) >= nelt)
6799 k = 0;
6800 l += (3 - (nelt % 3));
6802 sel[i] = 3 * k + (l % 3);
6803 k++;
6805 vec_perm_indices indices (sel, 2, nelt);
6806 if (!can_vec_perm_const_p (vmode, vmode, indices))
6808 if (dump_enabled_p ())
6809 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6810 "shuffle of 3 fields structure is not \
6811 supported by target\n");
6812 return false;
6814 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6816 /* Generating permutation constant to shift all elements.
6817 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6818 for (i = 0; i < nelt; i++)
6819 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6820 indices.new_vector (sel, 2, nelt);
6821 if (!can_vec_perm_const_p (vmode, vmode, indices))
6823 if (dump_enabled_p ())
6824 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6825 "shift permutation is not supported by target\n");
6826 return false;
6828 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6830 /* Generating permutation constant to shift all elements.
6831 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6832 for (i = 0; i < nelt; i++)
6833 sel[i] = 2 * (nelt / 3) + 1 + i;
6834 indices.new_vector (sel, 2, nelt);
6835 if (!can_vec_perm_const_p (vmode, vmode, indices))
6837 if (dump_enabled_p ())
6838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6839 "shift permutation is not supported by target\n");
6840 return false;
6842 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6844 /* Generating permutation constant to shift all elements.
6845 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6846 for (i = 0; i < nelt; i++)
6847 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6848 indices.new_vector (sel, 2, nelt);
6849 if (!can_vec_perm_const_p (vmode, vmode, indices))
6851 if (dump_enabled_p ())
6852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6853 "shift permutation is not supported by target\n");
6854 return false;
6856 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6858 /* Generating permutation constant to shift all elements.
6859 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6860 for (i = 0; i < nelt; i++)
6861 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6862 indices.new_vector (sel, 2, nelt);
6863 if (!can_vec_perm_const_p (vmode, vmode, indices))
6865 if (dump_enabled_p ())
6866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6867 "shift permutation is not supported by target\n");
6868 return false;
6870 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6872 for (k = 0; k < 3; k++)
6874 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6875 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6876 dr_chain[k], dr_chain[k],
6877 perm3_mask);
6878 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6879 vect[k] = data_ref;
6882 for (k = 0; k < 3; k++)
6884 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6885 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6886 vect[k % 3], vect[(k + 1) % 3],
6887 shift1_mask);
6888 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6889 vect_shift[k] = data_ref;
6892 for (k = 0; k < 3; k++)
6894 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6895 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6896 vect_shift[(4 - k) % 3],
6897 vect_shift[(3 - k) % 3],
6898 shift2_mask);
6899 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6900 vect[k] = data_ref;
6903 (*result_chain)[3 - (nelt % 3)] = vect[2];
6905 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6906 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6907 vect[0], shift3_mask);
6908 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6909 (*result_chain)[nelt % 3] = data_ref;
6911 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6912 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6913 vect[1], shift4_mask);
6914 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6915 (*result_chain)[0] = data_ref;
6916 return true;
6918 return false;
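The selector formulas used above for shift1_mask through shift4_mask are easiest to check by evaluating them for a concrete vector length. The following standalone sketch is an editorial illustration only, not part of GCC: it recomputes the four selector sequences for nelt == 8 and reproduces the values quoted in the comments ({6 7 8 9 10 11 12 13}, {5 6 7 8 9 10 11 12}, {3 4 5 6 7 8 9 10} and {5 6 7 8 9 10 11 12}).

#include <cstdio>

int
main ()
{
  const unsigned nelt = 8;  /* vector length used in the comments above */
  for (unsigned i = 0; i < nelt; i++)
    {
      unsigned shift1 = 2 * (nelt / 3) + (nelt % 3) + i;      /* 6 .. 13 */
      unsigned shift2 = 2 * (nelt / 3) + 1 + i;               /* 5 .. 12 */
      unsigned shift3 = (nelt / 3) + (nelt % 3) / 2 + i;      /* 3 .. 10 */
      unsigned shift4 = 2 * (nelt / 3) + (nelt % 3) / 2 + i;  /* 5 .. 12 */
      printf ("i=%u  shift1=%2u  shift2=%2u  shift3=%2u  shift4=%2u\n",
              i, shift1, shift2, shift3, shift4);
    }
  return 0;
}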
6921 /* Function vect_transform_grouped_load.
6923 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6924 to perform their permutation and assign the resulting vectorized statements to
6925 the scalar statements.
6928 void
6929 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6930 vec<tree> dr_chain,
6931 int size, gimple_stmt_iterator *gsi)
6933 machine_mode mode;
6934 vec<tree> result_chain = vNULL;
6936 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6937 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
6938 permuted vectors that are ready for vector computation. */
6939 result_chain.create (size);
6941 /* If the reassociation width for the vector type is 2 or greater, the target
6942 machine can execute 2 or more vector instructions in parallel. Otherwise try
6943 to build the chain for the load group using vect_shift_permute_load_chain. */
6944 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6945 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6946 || pow2p_hwi (size)
6947 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6948 gsi, &result_chain))
6949 vect_permute_load_chain (vinfo, dr_chain,
6950 size, stmt_info, gsi, &result_chain);
6951 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6952 result_chain.release ();
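The comment above describes when the shift-based chain is preferred over the generic permute chain. As a minimal sketch of that decision, assuming a hypothetical helper name (this is not GCC code), the short-circuit condition boils down to:

/* Hypothetical helper, illustration only: the shift-based load chain is
   only attempted when the target cannot issue permutes in parallel
   (reassociation width of at most 1) and the group size is not a power
   of two; vect_permute_load_chain remains the fallback either way.  */
static bool
try_shift_permute_first_p (int reassoc_width, unsigned int group_size)
{
  bool pow2 = group_size != 0 && (group_size & (group_size - 1)) == 0;
  return reassoc_width <= 1 && !pow2;
}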
6955 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6956 generated as part of the vectorization of STMT_INFO. Assign the statement
6957 for each vector to the associated scalar statement. */
6959 void
6960 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6961 vec<tree> result_chain)
6963 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6964 unsigned int i, gap_count;
6965 tree tmp_data_ref;
6967 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6968 Since we scan the chain starting from its first node, their order
6969 corresponds to the order of data-refs in RESULT_CHAIN. */
6970 stmt_vec_info next_stmt_info = first_stmt_info;
6971 gap_count = 1;
6972 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6974 if (!next_stmt_info)
6975 break;
6977 /* Skip the gaps. Loads created for the gaps will be removed later by the
6978 dead code elimination pass. No need to check for the first stmt in
6979 the group, since it always exists.
6980 DR_GROUP_GAP is the number of steps in elements from the previous
6981 access (if there is no gap, DR_GROUP_GAP is 1). We skip loads that
6982 correspond to the gaps (see the sketch below). */
6983 if (next_stmt_info != first_stmt_info
6984 && gap_count < DR_GROUP_GAP (next_stmt_info))
6986 gap_count++;
6987 continue;
6990 /* ??? The following needs cleanup after the removal of
6991 DR_GROUP_SAME_DR_STMT. */
6992 if (next_stmt_info)
6994 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6995 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6996 copies, and we put the new vector statement last. */
6997 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6999 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
7000 gap_count = 1;
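To make the DR_GROUP_GAP bookkeeping of the loop above concrete, here is a standalone sketch (editorial illustration, not GCC code). It hands the vectors of a four-entry result chain to a three-element load group in which the second element sits two scalar steps after the first, so one vector is recognized as a gap filler:

#include <cstdio>
#include <vector>

int
main ()
{
  /* Per-element DR_GROUP_GAP values; the entry for the first element is
     never examined.  */
  std::vector<unsigned> group_gap = { 1, 2, 1 };
  unsigned elt = 0, gap_count = 1;
  for (unsigned v = 0; v < 4 && elt < group_gap.size (); v++)
    {
      if (elt != 0 && gap_count < group_gap[elt])
        {
          printf ("vector %u fills a gap (removed as dead code later)\n", v);
          gap_count++;
          continue;
        }
      printf ("vector %u -> group element %u\n", v, elt);
      elt++;
      gap_count = 1;
    }
  return 0;
}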
7005 /* Function vect_can_force_dr_alignment_p.
7007 Returns whether the alignment of a DECL can be forced to be aligned
7008 on an ALIGNMENT-bit boundary. */
7010 bool
7011 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
7013 if (!VAR_P (decl))
7014 return false;
7016 if (decl_in_symtab_p (decl)
7017 && !symtab_node::get (decl)->can_increase_alignment_p ())
7018 return false;
7020 if (TREE_STATIC (decl))
7021 return (known_le (alignment,
7022 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
7023 else
7024 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
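The function above applies two different caps depending on whether the DECL has static storage. The sketch below condenses that rule; it is an editorial illustration with placeholder limits, not GCC code, and it omits the symtab can_increase_alignment_p check that the real function also performs:

#include <cstdio>

/* Placeholder limits in bits; in GCC these are the target-dependent
   macros MAX_OFILE_ALIGNMENT and MAX_STACK_ALIGNMENT.  */
static const unsigned long long max_ofile_alignment = 1ULL << 31;
static const unsigned long long max_stack_alignment = 256;

static bool
can_force_alignment_p (bool is_static, unsigned long long alignment)
{
  return alignment <= (is_static ? max_ofile_alignment : max_stack_alignment);
}

int
main ()
{
  printf ("force 512-bit alignment on a static object:     %s\n",
          can_force_alignment_p (true, 512) ? "yes" : "no");
  printf ("force 512-bit alignment on an automatic object: %s\n",
          can_force_alignment_p (false, 512) ? "yes" : "no");
  return 0;
}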
7027 /* Return whether the data reference DR_INFO, accessed with vector type
7028 VECTYPE, is supported with respect to its alignment.
7029 MISALIGNMENT is the known misalignment of the access, or
7030 DR_MISALIGNMENT_UNKNOWN if the misalignment is not known at
7031 compile time. */
7033 enum dr_alignment_support
7034 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
7035 tree vectype, int misalignment)
7037 data_reference *dr = dr_info->dr;
7038 stmt_vec_info stmt_info = dr_info->stmt;
7039 machine_mode mode = TYPE_MODE (vectype);
7040 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7041 class loop *vect_loop = NULL;
7042 bool nested_in_vect_loop = false;
7044 if (misalignment == 0)
7045 return dr_aligned;
7047 /* For now assume all conditional loads/stores support unaligned
7048 access without any special code. */
7049 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
7050 if (gimple_call_internal_p (stmt)
7051 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
7052 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
7053 return dr_unaligned_supported;
7055 if (loop_vinfo)
7057 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
7058 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
7061 /* Possibly unaligned access. */
7063 /* We can choose between using the implicit realignment scheme (generating
7064 a misaligned_move stmt) and the explicit realignment scheme (generating
7065 aligned loads with a REALIGN_LOAD). There are two variants of the
7066 explicit realignment scheme: optimized and unoptimized.
7067 We can optimize the realignment only if the step between consecutive
7068 vector loads is equal to the vector size. Since the vector memory
7069 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
7070 is guaranteed that the misalignment amount remains the same throughout the
7071 execution of the vectorized loop. Therefore, we can create the
7072 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
7073 at the loop preheader.
7075 However, in the case of outer-loop vectorization, when vectorizing a
7076 memory access in the inner-loop nested within the LOOP that is now being
7077 vectorized, while it is guaranteed that the misalignment of the
7078 vectorized memory access will remain the same in different outer-loop
7079 iterations, it is *not* guaranteed that it will remain the same throughout
7080 the execution of the inner-loop. This is because the inner-loop advances
7081 with the original scalar step (and not in steps of VS). If the inner-loop
7082 step happens to be a multiple of VS, then the misalignment remains fixed
7083 and we can use the optimized realignment scheme. For example:
7085 for (i=0; i<N; i++)
7086 for (j=0; j<M; j++)
7087 s += a[i+j];
7089 When vectorizing the i-loop in the above example, the step between
7090 consecutive vector loads is 1, and so the misalignment does not remain
7091 fixed across the execution of the inner-loop, and the realignment cannot
7092 be optimized (as illustrated in the following pseudo vectorized loop):
7094 for (i=0; i<N; i+=4)
7095 for (j=0; j<M; j++){
7096 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
7097 // when j is {0,1,2,3,4,5,6,7,...} respectively.
7098 // (assuming that we start from an aligned address).
7101 We therefore have to use the unoptimized realignment scheme:
7103 for (i=0; i<N; i+=4)
7104 for (j=k; j<M; j+=4)
7105 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
7106 // that the misalignment of the initial address is
7107 // 0).
7109 The loop can then be vectorized as follows:
7111 for (k=0; k<4; k++){
7112 rt = get_realignment_token (&vp[k]);
7113 for (i=0; i<N; i+=4){
7114 v1 = vp[i+k];
7115 for (j=k; j<M; j+=4){
7116 v2 = vp[i+j+VS-1];
7117 va = REALIGN_LOAD <v1,v2,rt>;
7118 vs += va;
7119 v1 = v2;
7122 } */
7124 if (DR_IS_READ (dr))
7126 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
7127 && (!targetm.vectorize.builtin_mask_for_load
7128 || targetm.vectorize.builtin_mask_for_load ()))
7130 /* If we are doing SLP then the accesses need not have the
7131 same alignment; instead it depends on the SLP group size. */
7132 if (loop_vinfo
7133 && STMT_SLP_TYPE (stmt_info)
7134 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
7135 || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7136 * (DR_GROUP_SIZE
7137 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
7138 TYPE_VECTOR_SUBPARTS (vectype))))
7140 else if (!loop_vinfo
7141 || (nested_in_vect_loop
7142 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
7143 GET_MODE_SIZE (TYPE_MODE (vectype)))))
7144 return dr_explicit_realign;
7145 else
7146 return dr_explicit_realign_optimized;
7150 bool is_packed = false;
7151 tree type = TREE_TYPE (DR_REF (dr));
7152 if (misalignment == DR_MISALIGNMENT_UNKNOWN)
7153 is_packed = not_size_aligned (DR_REF (dr));
7154 if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
7155 is_packed))
7156 return dr_unaligned_supported;
7158 /* Unsupported. */
7159 return dr_unaligned_unsupported;
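Putting the branches of vect_supportable_dr_alignment together, the overall classification order can be summarized as below. This is a condensed editorial sketch with hypothetical names, not the GCC implementation; in particular, the SLP group-size test and the exact outer-loop step condition are folded into a single "misalignment stays invariant" flag:

/* Hypothetical stand-ins for the checks performed above.  */
struct access_desc
{
  bool known_aligned;            /* misalignment == 0                      */
  bool masked_load_or_store;     /* IFN_MASK_LOAD / IFN_MASK_STORE         */
  bool is_read;                  /* DR_IS_READ                             */
  bool realign_load_available;   /* vec_realign_load optab + mask builtin  */
  bool misalignment_invariant;   /* step keeps the misalignment fixed      */
  bool target_handles_misalign;  /* support_vector_misalignment hook       */
};

enum class dr_align_kind
{
  aligned,
  unaligned_supported,
  explicit_realign,
  explicit_realign_optimized,
  unaligned_unsupported
};

static dr_align_kind
classify_alignment_support (const access_desc &a)
{
  if (a.known_aligned)
    return dr_align_kind::aligned;
  if (a.masked_load_or_store)
    return dr_align_kind::unaligned_supported;
  if (a.is_read && a.realign_load_available)
    return a.misalignment_invariant
           ? dr_align_kind::explicit_realign_optimized
           : dr_align_kind::explicit_realign;
  return a.target_handles_misalign
         ? dr_align_kind::unaligned_supported
         : dr_align_kind::unaligned_unsupported;
}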