gcc/tree-vect-data-refs.cc
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
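/* For example, with COUNT == 2 and vector mode V4SImode this asks whether
   the target provides a vec_load_lanes/vec_store_lanes pattern for the
   corresponding two-vector tuple mode (on AArch64 that is OImode, mapping
   to the LD2/ST2 instructions).  */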
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
100 /* Helper function to identify a simd clone call. If this is a call to a
101 function with simd clones then return the corresponding cgraph_node,
102 otherwise return NULL. */
104 static cgraph_node*
105 simd_clone_call_p (gimple *stmt)
107 gcall *call = dyn_cast <gcall *> (stmt);
108 if (!call)
109 return NULL;
111 tree fndecl = NULL_TREE;
112 if (gimple_call_internal_p (call, IFN_MASK_CALL))
113 fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
114 else
115 fndecl = gimple_call_fndecl (stmt);
117 if (fndecl == NULL_TREE)
118 return NULL;
120 cgraph_node *node = cgraph_node::get (fndecl);
121 if (node && node->simd_clones != NULL)
122 return node;
124 return NULL;
129 /* Return the smallest scalar part of STMT_INFO.
130 This is used to determine the vectype of the stmt. We generally set the
131 vectype according to the type of the result (lhs). For stmts whose
132 result-type is different than the type of the arguments (e.g., demotion,
133 promotion), vectype will be reset appropriately (later). Note that we have
134 to visit the smallest datatype in this function, because that determines the
135 VF. If the smallest datatype in the loop is present only as the rhs of a
136 promotion operation - we'd miss it.
137 Such a case, where a variable of this datatype does not appear in the lhs
138 anywhere in the loop, can only occur if it's an invariant: e.g.:
139 'int_x = (int) short_inv', which we'd expect to have been optimized away by
140 invariant motion. However, we cannot rely on invariant motion to always
141 take invariants out of the loop, and so in the case of promotion we also
142 have to check the rhs.
143 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
144 types. */
146 tree
147 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
149 HOST_WIDE_INT lhs, rhs;
151 /* During the analysis phase, this function is called on arbitrary
152 statements that might not have scalar results. */
153 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
154 return scalar_type;
156 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
158 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
159 if (assign)
161 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
162 if (gimple_assign_cast_p (assign)
163 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
164 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
165 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
166 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
167 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
169 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
171 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
172 if (rhs < lhs)
173 scalar_type = rhs_type;
176 else if (cgraph_node *node = simd_clone_call_p (stmt_info->stmt))
178 auto clone = node->simd_clones->simdclone;
179 for (unsigned int i = 0; i < clone->nargs; ++i)
181 if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
183 tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type);
184 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type));
185 if (rhs < lhs)
187 scalar_type = arg_scalar_type;
188 lhs = rhs;
193 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
195 unsigned int i = 0;
196 if (gimple_call_internal_p (call))
198 internal_fn ifn = gimple_call_internal_fn (call);
199 if (internal_load_fn_p (ifn))
200 /* For loads the LHS type does the trick. */
201 i = ~0U;
202 else if (internal_store_fn_p (ifn))
 204 	      /* For stores use the type of the stored value.  */
205 i = internal_fn_stored_value_index (ifn);
206 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
207 i = ~0U;
209 else if (internal_fn_mask_index (ifn) == 0)
210 i = 1;
212 if (i < gimple_call_num_args (call))
214 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
215 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
217 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
218 if (rhs < lhs)
219 scalar_type = rhs_type;
224 return scalar_type;
228 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
229 tested at run-time. Return TRUE if DDR was successfully inserted.
230 Return false if versioning is not supported. */
232 static opt_result
233 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
235 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
237 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
238 return opt_result::failure_at (vect_location,
239 "will not create alias checks, as"
240 " --param vect-max-version-for-alias-checks"
241 " == 0\n");
243 opt_result res
244 = runtime_alias_check_p (ddr, loop,
245 optimize_loop_nest_for_speed_p (loop));
246 if (!res)
247 return res;
249 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
250 return opt_result::success ();
253 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
255 static void
256 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
258 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
259 for (unsigned int i = 0; i < checks.length(); ++i)
260 if (checks[i] == value)
261 return;
263 if (dump_enabled_p ())
264 dump_printf_loc (MSG_NOTE, vect_location,
265 "need run-time check that %T is nonzero\n",
266 value);
267 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
270 /* Return true if we know that the order of vectorized DR_INFO_A and
271 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
272 DR_INFO_B. At least one of the accesses is a write. */
274 static bool
275 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
277 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
278 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
280 /* Single statements are always kept in their original order. */
281 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
282 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
283 return true;
285 /* If there is a loop invariant read involved we might vectorize it in
 286      the prologue, breaking scalar order with respect to the in-loop store.  */
287 if ((DR_IS_READ (dr_info_a->dr) && integer_zerop (DR_STEP (dr_info_a->dr)))
288 || (DR_IS_READ (dr_info_b->dr) && integer_zerop (DR_STEP (dr_info_b->dr))))
289 return false;
291 /* STMT_A and STMT_B belong to overlapping groups. All loads are
292 emitted at the position of the first scalar load.
293 Stores in a group are emitted at the position of the last scalar store.
294 Compute that position and check whether the resulting order matches
295 the current one. */
296 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
297 if (il_a)
299 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
300 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
301 s = DR_GROUP_NEXT_ELEMENT (s))
302 il_a = get_later_stmt (il_a, s);
303 else /* DR_IS_READ */
304 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
305 s = DR_GROUP_NEXT_ELEMENT (s))
306 if (get_later_stmt (il_a, s) == il_a)
307 il_a = s;
309 else
310 il_a = stmtinfo_a;
311 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
312 if (il_b)
314 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
315 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
316 s = DR_GROUP_NEXT_ELEMENT (s))
317 il_b = get_later_stmt (il_b, s);
318 else /* DR_IS_READ */
319 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
320 s = DR_GROUP_NEXT_ELEMENT (s))
321 if (get_later_stmt (il_b, s) == il_b)
322 il_b = s;
324 else
325 il_b = stmtinfo_b;
326 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
327 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
330 /* A subroutine of vect_analyze_data_ref_dependence. Handle
331 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
332 distances. These distances are conservatively correct but they don't
333 reflect a guaranteed dependence.
335 Return true if this function does all the work necessary to avoid
336 an alias or false if the caller should use the dependence distances
337 to limit the vectorization factor in the usual way. LOOP_DEPTH is
338 the depth of the loop described by LOOP_VINFO and the other arguments
339 are as for vect_analyze_data_ref_dependence. */
341 static bool
342 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
343 loop_vec_info loop_vinfo,
344 int loop_depth, unsigned int *max_vf)
346 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
347 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
349 int dist = dist_v[loop_depth];
350 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
352 /* If the user asserted safelen >= DIST consecutive iterations
353 can be executed concurrently, assume independence.
355 ??? An alternative would be to add the alias check even
356 in this case, and vectorize the fallback loop with the
357 maximum VF set to safelen. However, if the user has
358 explicitly given a length, it's less likely that that
359 would be a win. */
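	  /* loop->safelen typically comes from a '#pragma omp simd safelen(N)'
	     clause; '#pragma GCC ivdep' sets it to INT_MAX.  */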
360 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
362 if ((unsigned int) loop->safelen < *max_vf)
363 *max_vf = loop->safelen;
364 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
365 continue;
368 /* For dependence distances of 2 or more, we have the option
369 of limiting VF or checking for an alias at runtime.
370 Prefer to check at runtime if we can, to avoid limiting
371 the VF unnecessarily when the bases are in fact independent.
373 Note that the alias checks will be removed if the VF ends up
374 being small enough. */
375 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
376 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
377 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
378 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
379 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
382 return true;
386 /* Function vect_analyze_data_ref_dependence.
388 FIXME: I needed to change the sense of the returned flag.
390 Return FALSE if there (might) exist a dependence between a memory-reference
391 DRA and a memory-reference DRB. When versioning for alias may check a
392 dependence at run-time, return TRUE. Adjust *MAX_VF according to
393 the data dependence. */
395 static opt_result
396 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
397 loop_vec_info loop_vinfo,
398 unsigned int *max_vf)
400 unsigned int i;
401 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
402 struct data_reference *dra = DDR_A (ddr);
403 struct data_reference *drb = DDR_B (ddr);
404 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
405 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
406 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
407 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
408 lambda_vector dist_v;
409 unsigned int loop_depth;
411 /* If user asserted safelen consecutive iterations can be
412 executed concurrently, assume independence. */
413 auto apply_safelen = [&]()
415 if (loop->safelen >= 2)
417 if ((unsigned int) loop->safelen < *max_vf)
418 *max_vf = loop->safelen;
419 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
420 return true;
422 return false;
425 /* In loop analysis all data references should be vectorizable. */
426 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
427 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
428 gcc_unreachable ();
430 /* Independent data accesses. */
431 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
432 return opt_result::success ();
434 if (dra == drb
435 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
436 return opt_result::success ();
438 /* We do not have to consider dependences between accesses that belong
439 to the same group, unless the stride could be smaller than the
440 group size. */
441 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
442 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
443 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
444 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
445 return opt_result::success ();
447 /* Even if we have an anti-dependence then, as the vectorized loop covers at
448 least two scalar iterations, there is always also a true dependence.
449 As the vectorizer does not re-order loads and stores we can ignore
450 the anti-dependence if TBAA can disambiguate both DRs similar to the
451 case with known negative distance anti-dependences (positive
452 distance anti-dependences would violate TBAA constraints). */
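  /* For example, under strict aliasing a read through a 'float *' and a
     write through a 'long *' have non-conflicting alias sets, so such an
     anti-dependence can be ignored.  */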
453 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
454 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
455 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
456 get_alias_set (DR_REF (drb))))
457 return opt_result::success ();
459 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
460 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
462 if (apply_safelen ())
463 return opt_result::success ();
465 return opt_result::failure_at
466 (stmtinfo_a->stmt,
467 "possible alias involving gather/scatter between %T and %T\n",
468 DR_REF (dra), DR_REF (drb));
471 /* Unknown data dependence. */
472 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
474 if (apply_safelen ())
475 return opt_result::success ();
477 if (dump_enabled_p ())
478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
479 "versioning for alias required: "
480 "can't determine dependence between %T and %T\n",
481 DR_REF (dra), DR_REF (drb));
483 /* Add to list of ddrs that need to be tested at run-time. */
484 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
487 /* Known data dependence. */
488 if (DDR_NUM_DIST_VECTS (ddr) == 0)
490 if (apply_safelen ())
491 return opt_result::success ();
493 if (dump_enabled_p ())
494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
495 "versioning for alias required: "
496 "bad dist vector for %T and %T\n",
497 DR_REF (dra), DR_REF (drb));
498 /* Add to list of ddrs that need to be tested at run-time. */
499 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
502 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
504 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
505 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
506 loop_depth, max_vf))
507 return opt_result::success ();
509 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
511 int dist = dist_v[loop_depth];
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE, vect_location,
515 "dependence distance = %d.\n", dist);
517 if (dist == 0)
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "dependence distance == 0 between %T and %T\n",
522 DR_REF (dra), DR_REF (drb));
524 /* When we perform grouped accesses and perform implicit CSE
525 by detecting equal accesses and doing disambiguation with
526 runtime alias tests like for
527 .. = a[i];
528 .. = a[i+1];
529 a[i] = ..;
530 a[i+1] = ..;
531 *p = ..;
532 .. = a[i];
533 .. = a[i+1];
534 where we will end up loading { a[i], a[i+1] } once, make
535 sure that inserting group loads before the first load and
536 stores after the last store will do the right thing.
537 Similar for groups like
538 a[i] = ...;
539 ... = a[i];
540 a[i+1] = ...;
541 where loads from the group interleave with the store. */
542 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
543 return opt_result::failure_at (stmtinfo_a->stmt,
544 "READ_WRITE dependence"
545 " in interleaving.\n");
547 if (loop->safelen < 2)
549 tree indicator = dr_zero_step_indicator (dra);
550 if (!indicator || integer_zerop (indicator))
551 return opt_result::failure_at (stmtinfo_a->stmt,
552 "access also has a zero step\n");
553 else if (TREE_CODE (indicator) != INTEGER_CST)
554 vect_check_nonzero_value (loop_vinfo, indicator);
556 continue;
559 if (dist > 0 && DDR_REVERSED_P (ddr))
561 /* If DDR_REVERSED_P the order of the data-refs in DDR was
562 reversed (to make distance vector positive), and the actual
563 distance is negative. */
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE, vect_location,
566 "dependence distance negative.\n");
567 /* When doing outer loop vectorization, we need to check if there is
568 a backward dependence at the inner loop level if the dependence
569 at the outer loop is reversed. See PR81740. */
570 if (nested_in_vect_loop_p (loop, stmtinfo_a)
571 || nested_in_vect_loop_p (loop, stmtinfo_b))
573 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
574 DDR_LOOP_NEST (ddr));
575 if (dist_v[inner_depth] < 0)
576 return opt_result::failure_at (stmtinfo_a->stmt,
577 "not vectorized, dependence "
578 "between data-refs %T and %T\n",
579 DR_REF (dra), DR_REF (drb));
581 /* Record a negative dependence distance to later limit the
582 amount of stmt copying / unrolling we can perform.
583 Only need to handle read-after-write dependence. */
584 if (DR_IS_READ (drb)
585 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
586 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
587 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
588 continue;
591 unsigned int abs_dist = abs (dist);
592 if (abs_dist >= 2 && abs_dist < *max_vf)
594 /* The dependence distance requires reduction of the maximal
595 vectorization factor. */
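	  /* For example, for a loop containing a[i] = a[i - 3] + 1 the
	     dependence distance is 3: only three consecutive iterations are
	     independent, so the maximal VF is clamped to 3 here.  */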
596 *max_vf = abs_dist;
597 if (dump_enabled_p ())
598 dump_printf_loc (MSG_NOTE, vect_location,
599 "adjusting maximal vectorization factor to %i\n",
600 *max_vf);
603 if (abs_dist >= *max_vf)
605 /* Dependence distance does not create dependence, as far as
606 vectorization is concerned, in this case. */
607 if (dump_enabled_p ())
608 dump_printf_loc (MSG_NOTE, vect_location,
609 "dependence distance >= VF.\n");
610 continue;
613 return opt_result::failure_at (stmtinfo_a->stmt,
614 "not vectorized, possible dependence "
615 "between data-refs %T and %T\n",
616 DR_REF (dra), DR_REF (drb));
619 return opt_result::success ();
 622 /* Function vect_analyze_early_break_dependences.
 624    Examine all the data references in the loop and make sure that if we have
 625    multiple exits that we are able to safely move stores such that they become
626 safe for vectorization. The function also calculates the place where to move
627 the instructions to and computes what the new vUSE chain should be.
629 This works in tandem with the CFG that will be produced by
630 slpeel_tree_duplicate_loop_to_edge_cfg later on.
632 This function tries to validate whether an early break vectorization
 633    is possible for the current instruction sequence.  Returns True if
 634    possible, otherwise False.
636 Requirements:
637 - Any memory access must be to a fixed size buffer.
638 - There must not be any loads and stores to the same object.
639 - Multiple loads are allowed as long as they don't alias.
641 NOTE:
 642      This implementation is very conservative.  Any overlapping loads/stores
 643      that take place before the early break statement get rejected, aside from
644 WAR dependencies.
646 i.e.:
648 a[i] = 8
649 c = a[i]
650 if (b[i])
653 is not allowed, but
655 c = a[i]
656 a[i] = 8
657 if (b[i])
 660    is, which is the common case.  */
662 static opt_result
663 vect_analyze_early_break_dependences (loop_vec_info loop_vinfo)
665 DUMP_VECT_SCOPE ("vect_analyze_early_break_dependences");
667 /* List of all load data references found during traversal. */
668 auto_vec<data_reference *> bases;
669 basic_block dest_bb = NULL;
671 hash_set <gimple *> visited;
672 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
673 class loop *loop_nest = loop_outer (loop);
675 if (dump_enabled_p ())
676 dump_printf_loc (MSG_NOTE, vect_location,
677 "loop contains multiple exits, analyzing"
678 " statement dependencies.\n");
680 /* Since we don't support general control flow, the location we'll move the
681 side-effects to is always the latch connected exit. When we support
682 general control flow we can do better but for now this is fine. */
683 dest_bb = single_pred (loop->latch);
684 basic_block bb = dest_bb;
688 /* If the destination block is also the header then we have nothing to do. */
689 if (!single_pred_p (bb))
690 continue;
692 bb = single_pred (bb);
693 gimple_stmt_iterator gsi = gsi_last_bb (bb);
695 /* Now analyze all the remaining statements and try to determine which
696 instructions are allowed/needed to be moved. */
697 while (!gsi_end_p (gsi))
699 gimple *stmt = gsi_stmt (gsi);
700 gsi_prev (&gsi);
701 if (!gimple_has_ops (stmt)
702 || is_gimple_debug (stmt))
703 continue;
705 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt);
706 auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo);
707 if (!dr_ref)
708 continue;
710 /* We currently only support statically allocated objects due to
711 not having first-faulting loads support or peeling for
712 alignment support. Compute the size of the referenced object
713 (it could be dynamically allocated). */
714 tree obj = DR_BASE_ADDRESS (dr_ref);
715 if (!obj || TREE_CODE (obj) != ADDR_EXPR)
717 if (dump_enabled_p ())
718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
719 "early breaks only supported on statically"
720 " allocated objects.\n");
721 return opt_result::failure_at (stmt,
722 "can't safely apply code motion to "
723 "dependencies of %G to vectorize "
724 "the early exit.\n", stmt);
727 tree refop = TREE_OPERAND (obj, 0);
728 tree refbase = get_base_address (refop);
729 if (!refbase || !DECL_P (refbase) || !DECL_SIZE (refbase)
730 || TREE_CODE (DECL_SIZE (refbase)) != INTEGER_CST)
732 if (dump_enabled_p ())
733 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
734 "early breaks only supported on"
735 " statically allocated objects.\n");
736 return opt_result::failure_at (stmt,
737 "can't safely apply code motion to "
738 "dependencies of %G to vectorize "
739 "the early exit.\n", stmt);
 742       /* Check if vector accesses to the object will be within bounds.
 743 	 The object size must be a constant, or we must assume the loop will be
 744 	 versioned or niters bounded by VF so that accesses are within range.  */
745 if (!ref_within_array_bound (stmt, DR_REF (dr_ref)))
747 if (dump_enabled_p ())
748 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
749 "early breaks not supported: vectorization "
750 "would %s beyond size of obj.",
751 DR_IS_READ (dr_ref) ? "read" : "write");
752 return opt_result::failure_at (stmt,
753 "can't safely apply code motion to "
754 "dependencies of %G to vectorize "
755 "the early exit.\n", stmt);
758 if (DR_IS_READ (dr_ref))
759 bases.safe_push (dr_ref);
760 else if (DR_IS_WRITE (dr_ref))
762 /* We are moving writes down in the CFG. To be sure that this
763 is valid after vectorization we have to check all the loads
764 we are sinking the stores past to see if any of them may
765 alias or are the same object.
767 Same objects will not be an issue because unless the store
768 is marked volatile the value can be forwarded. If the
769 store is marked volatile we don't vectorize the loop
770 anyway.
772 That leaves the check for aliasing. We don't really need
773 to care about the stores aliasing with each other since the
774 stores are moved in order so the effects are still observed
775 correctly. This leaves the check for WAR dependencies
776 which we would be introducing here if the DR can alias.
777 The check is quadratic in loads/stores but I have not found
778 a better API to do this. I believe all loads and stores
779 must be checked. We also must check them when we
780 encountered the store, since we don't care about loads past
781 the store. */
783 for (auto dr_read : bases)
784 if (dr_may_alias_p (dr_ref, dr_read, loop_nest))
786 if (dump_enabled_p ())
787 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
788 vect_location,
789 "early breaks not supported: "
790 "overlapping loads and stores "
791 "found before the break "
792 "statement.\n");
794 return opt_result::failure_at (stmt,
795 "can't safely apply code motion to dependencies"
796 " to vectorize the early exit. %G may alias with"
797 " %G\n", stmt, dr_read->stmt);
801 if (gimple_vdef (stmt))
803 if (dump_enabled_p ())
804 dump_printf_loc (MSG_NOTE, vect_location,
805 "==> recording stmt %G", stmt);
807 LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (stmt);
809 else if (gimple_vuse (stmt))
811 LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).safe_insert (0, stmt);
812 if (dump_enabled_p ())
813 dump_printf_loc (MSG_NOTE, vect_location,
814 "marked statement for vUSE update: %G", stmt);
818 while (bb != loop->header);
820 /* We don't allow outer -> inner loop transitions which should have been
821 trapped already during loop form analysis. */
822 gcc_assert (dest_bb->loop_father == loop);
824 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo) = dest_bb;
826 if (!LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).is_empty ())
828 /* All uses shall be updated to that of the first load. Entries are
829 stored in reverse order. */
830 tree vuse = gimple_vuse (LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).last ());
831 for (auto g : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
833 if (dump_enabled_p ())
834 dump_printf_loc (MSG_NOTE, vect_location,
835 "will update use: %T, mem_ref: %G", vuse, g);
839 if (dump_enabled_p ())
840 dump_printf_loc (MSG_NOTE, vect_location,
841 "recorded statements to be moved to BB %d\n",
842 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo)->index);
844 return opt_result::success ();
847 /* Function vect_analyze_data_ref_dependences.
849 Examine all the data references in the loop, and make sure there do not
850 exist any data dependences between them. Set *MAX_VF according to
851 the maximum vectorization factor the data dependences allow. */
853 opt_result
854 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
855 unsigned int *max_vf)
857 unsigned int i;
858 struct data_dependence_relation *ddr;
860 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
862 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
864 LOOP_VINFO_DDRS (loop_vinfo)
865 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
866 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
867 /* We do not need read-read dependences. */
868 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
869 &LOOP_VINFO_DDRS (loop_vinfo),
870 LOOP_VINFO_LOOP_NEST (loop_vinfo),
871 false);
872 gcc_assert (res);
875 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
877 /* For epilogues we either have no aliases or alias versioning
878 was applied to original loop. Therefore we may just get max_vf
879 using VF of original loop. */
880 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
881 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
882 else
883 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
885 opt_result res
886 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
887 if (!res)
888 return res;
 891   /* If we have early break statements in the loop, check to see if they
 892      are of a form we can vectorize.  */
893 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
894 return vect_analyze_early_break_dependences (loop_vinfo);
896 return opt_result::success ();
900 /* Function vect_slp_analyze_data_ref_dependence.
902 Return TRUE if there (might) exist a dependence between a memory-reference
903 DRA and a memory-reference DRB for VINFO. When versioning for alias
904 may check a dependence at run-time, return FALSE. Adjust *MAX_VF
905 according to the data dependence. */
907 static bool
908 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
909 struct data_dependence_relation *ddr)
911 struct data_reference *dra = DDR_A (ddr);
912 struct data_reference *drb = DDR_B (ddr);
913 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
914 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
916 /* We need to check dependences of statements marked as unvectorizable
917 as well, they still can prohibit vectorization. */
919 /* Independent data accesses. */
920 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
921 return false;
923 if (dra == drb)
924 return false;
926 /* Read-read is OK. */
927 if (DR_IS_READ (dra) && DR_IS_READ (drb))
928 return false;
930 /* If dra and drb are part of the same interleaving chain consider
931 them independent. */
932 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
933 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
934 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
935 return false;
937 /* Unknown data dependence. */
938 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
940 if (dump_enabled_p ())
941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
942 "can't determine dependence between %T and %T\n",
943 DR_REF (dra), DR_REF (drb));
945 else if (dump_enabled_p ())
946 dump_printf_loc (MSG_NOTE, vect_location,
947 "determined dependence between %T and %T\n",
948 DR_REF (dra), DR_REF (drb));
950 return true;
954 /* Analyze dependences involved in the transform of a store SLP NODE. */
956 static bool
957 vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
959 /* This walks over all stmts involved in the SLP store done
960 in NODE verifying we can sink them up to the last stmt in the
961 group. */
962 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
963 gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
965 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
967 stmt_vec_info access_info
968 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
969 if (access_info == last_access_info)
970 continue;
971 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
972 ao_ref ref;
973 bool ref_initialized_p = false;
974 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
975 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
977 gimple *stmt = gsi_stmt (gsi);
978 if (! gimple_vuse (stmt))
979 continue;
981 /* If we couldn't record a (single) data reference for this
982 stmt we have to resort to the alias oracle. */
983 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
984 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
985 if (!dr_b)
987 /* We are moving a store - this means
988 we cannot use TBAA for disambiguation. */
989 if (!ref_initialized_p)
990 ao_ref_init (&ref, DR_REF (dr_a));
991 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
992 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
993 return false;
994 continue;
997 gcc_assert (!gimple_visited_p (stmt));
999 ddr_p ddr = initialize_data_dependence_relation (dr_a,
1000 dr_b, vNULL);
1001 bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1002 free_dependence_relation (ddr);
1003 if (dependent)
1004 return false;
1007 return true;
1010 /* Analyze dependences involved in the transform of a load SLP NODE. STORES
1011 contain the vector of scalar stores of this instance if we are
1012 disambiguating the loads. */
1014 static bool
1015 vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
1016 vec<stmt_vec_info> stores,
1017 stmt_vec_info last_store_info)
1019 /* This walks over all stmts involved in the SLP load done
1020 in NODE verifying we can hoist them up to the first stmt in the
1021 group. */
1022 stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
1023 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
1025 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
1027 stmt_vec_info access_info
1028 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
1029 if (access_info == first_access_info)
1030 continue;
1031 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
1032 ao_ref ref;
1033 bool ref_initialized_p = false;
1034 hash_set<stmt_vec_info> grp_visited;
1035 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
1036 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
1038 gimple *stmt = gsi_stmt (gsi);
1039 if (! gimple_vdef (stmt))
1040 continue;
1042 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1044 /* If we run into a store of this same instance (we've just
1045 marked those) then delay dependence checking until we run
1046 into the last store because this is where it will have
1047 been sunk to (and we verified that we can do that already). */
1048 if (gimple_visited_p (stmt))
1050 if (stmt_info != last_store_info)
1051 continue;
1053 for (stmt_vec_info &store_info : stores)
1055 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
1056 ddr_p ddr = initialize_data_dependence_relation
1057 (dr_a, store_dr, vNULL);
1058 bool dependent
1059 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1060 free_dependence_relation (ddr);
1061 if (dependent)
1062 return false;
1064 continue;
1067 auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
1069 /* We are hoisting a load - this means we can use TBAA for
1070 disambiguation. */
1071 if (!ref_initialized_p)
1072 ao_ref_init (&ref, DR_REF (dr_a));
1073 if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
1075 /* If we couldn't record a (single) data reference for this
1076 stmt we have to give up now. */
1077 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1078 if (!dr_b)
1079 return false;
1080 ddr_p ddr = initialize_data_dependence_relation (dr_a,
1081 dr_b, vNULL);
1082 bool dependent
1083 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1084 free_dependence_relation (ddr);
1085 if (dependent)
1086 return false;
1088 /* No dependence. */
1089 return true;
1091 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1093 /* When we run into a store group we have to honor
1094 that earlier stores might be moved here. We don't
1095 know exactly which and where to since we lack a
1096 back-mapping from DR to SLP node, so assume all
1097 earlier stores are sunk here. It's enough to
1098 consider the last stmt of a group for this.
1099 ??? Both this and the fact that we disregard that
1100 the conflicting instance might be removed later
1101 is overly conservative. */
1102 if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
1103 for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
1104 store_info != NULL;
1105 store_info = DR_GROUP_NEXT_ELEMENT (store_info))
1106 if ((store_info == stmt_info
1107 || get_later_stmt (store_info, stmt_info) == stmt_info)
1108 && !check_hoist (store_info))
1109 return false;
1111 else
1113 if (!check_hoist (stmt_info))
1114 return false;
1118 return true;
1122 /* Function vect_slp_analyze_instance_dependence.
1124    Examine all the data references in SLP instance INSTANCE and make sure
1125    there do not exist any data dependences that would prevent sinking its
1126    stores or hoisting its loads to the vectorized stmt insert location.  */
1128 bool
1129 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
1131 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
1133 /* The stores of this instance are at the root of the SLP tree. */
1134 slp_tree store = NULL;
1135 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
1136 store = SLP_INSTANCE_TREE (instance);
1138 /* Verify we can sink stores to the vectorized stmt insert location. */
1139 stmt_vec_info last_store_info = NULL;
1140 if (store)
1142 if (! vect_slp_analyze_store_dependences (vinfo, store))
1143 return false;
1145 /* Mark stores in this instance and remember the last one. */
1146 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
1147 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1148 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
1151 bool res = true;
1153 /* Verify we can sink loads to the vectorized stmt insert location,
1154 special-casing stores of this instance. */
1155 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
1156 if (! vect_slp_analyze_load_dependences (vinfo, load,
1157 store
1158 ? SLP_TREE_SCALAR_STMTS (store)
1159 : vNULL, last_store_info))
1161 res = false;
1162 break;
1165 /* Unset the visited flag. */
1166 if (store)
1167 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1168 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
1170 return res;
1173 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
1174 applied. */
1177 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
1179 HOST_WIDE_INT diff = 0;
1180 /* Alignment is only analyzed for the first element of a DR group,
1181 use that but adjust misalignment by the offset of the access. */
1182 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
1184 dr_vec_info *first_dr
1185 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
1186 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
1187 INTEGER_CSTs and the first element in the group has the lowest
1188 address. */
1189 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
1190 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
1191 gcc_assert (diff >= 0);
1192 dr_info = first_dr;
1195 int misalign = dr_info->misalignment;
1196 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
1197 if (misalign == DR_MISALIGNMENT_UNKNOWN)
1198 return misalign;
1200 /* If the access is only aligned for a vector type with smaller alignment
1201 requirement the access has unknown misalignment. */
1202 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
1203 targetm.vectorize.preferred_vector_alignment (vectype)))
1204 return DR_MISALIGNMENT_UNKNOWN;
1206 /* Apply the offset from the DR group start and the externally supplied
1207 offset which can for example result from a negative stride access. */
1208 poly_int64 misalignment = misalign + diff + offset;
1210 /* vect_compute_data_ref_alignment will have ensured that target_alignment
1211 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
1212 unsigned HOST_WIDE_INT target_alignment_c
1213 = dr_info->target_alignment.to_constant ();
1214 if (!known_misalignment (misalignment, target_alignment_c, &misalign))
1215 return DR_MISALIGNMENT_UNKNOWN;
1216 return misalign;
1219 /* Record the base alignment guarantee given by DRB, which occurs
1220 in STMT_INFO. */
1222 static void
1223 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
1224 innermost_loop_behavior *drb)
1226 bool existed;
1227 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
1228 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
1229 if (!existed || entry.second->base_alignment < drb->base_alignment)
1231 entry = std::make_pair (stmt_info, drb);
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_NOTE, vect_location,
1234 "recording new base alignment for %T\n"
1235 " alignment: %d\n"
1236 " misalignment: %d\n"
1237 " based on: %G",
1238 drb->base_address,
1239 drb->base_alignment,
1240 drb->base_misalignment,
1241 stmt_info->stmt);
1245 /* If the region we're going to vectorize is reached, all unconditional
1246 data references occur at least once. We can therefore pool the base
1247 alignment guarantees from each unconditional reference. Do this by
1248 going through all the data references in VINFO and checking whether
1249 the containing statement makes the reference unconditionally. If so,
1250 record the alignment of the base address in VINFO so that it can be
1251 used for all other references with the same base. */
1253 void
1254 vect_record_base_alignments (vec_info *vinfo)
1256 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1257 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
1258 for (data_reference *dr : vinfo->shared->datarefs)
1260 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1261 stmt_vec_info stmt_info = dr_info->stmt;
1262 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
1263 && STMT_VINFO_VECTORIZABLE (stmt_info)
1264 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1266 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
1268 /* If DR is nested in the loop that is being vectorized, we can also
1269 record the alignment of the base wrt the outer loop. */
1270 if (loop && nested_in_vect_loop_p (loop, stmt_info))
1271 vect_record_base_alignment
1272 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
1277 /* Function vect_compute_data_ref_alignment
1279 Compute the misalignment of the data reference DR_INFO when vectorizing
1280 with VECTYPE.
1282 Output:
1283 1. initialized misalignment info for DR_INFO
1285 FOR NOW: No analysis is actually performed. Misalignment is calculated
1286 only for trivial cases. TODO. */
1288 static void
1289 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1290 tree vectype)
1292 stmt_vec_info stmt_info = dr_info->stmt;
1293 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1294 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1295 class loop *loop = NULL;
1296 tree ref = DR_REF (dr_info->dr);
1298 if (dump_enabled_p ())
1299 dump_printf_loc (MSG_NOTE, vect_location,
1300 "vect_compute_data_ref_alignment:\n");
1302 if (loop_vinfo)
1303 loop = LOOP_VINFO_LOOP (loop_vinfo);
1305 /* Initialize misalignment to unknown. */
1306 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1308 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1309 return;
1311 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1312 bool step_preserves_misalignment_p;
1314 poly_uint64 vector_alignment
1315 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1316 BITS_PER_UNIT);
1317 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1319 /* If the main loop has peeled for alignment we have no way of knowing
1320 whether the data accesses in the epilogues are aligned. We can't at
1321 compile time answer the question whether we have entered the main loop or
1322 not. Fixes PR 92351. */
1323 if (loop_vinfo)
1325 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1326 if (orig_loop_vinfo
1327 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1328 return;
1331 unsigned HOST_WIDE_INT vect_align_c;
1332 if (!vector_alignment.is_constant (&vect_align_c))
1333 return;
1335 /* No step for BB vectorization. */
1336 if (!loop)
1338 gcc_assert (integer_zerop (drb->step));
1339 step_preserves_misalignment_p = true;
1342 /* In case the dataref is in an inner-loop of the loop that is being
1343 vectorized (LOOP), we use the base and misalignment information
1344 relative to the outer-loop (LOOP). This is ok only if the misalignment
1345 stays the same throughout the execution of the inner-loop, which is why
1346 we have to check that the stride of the dataref in the inner-loop evenly
1347 divides by the vector alignment. */
1348 else if (nested_in_vect_loop_p (loop, stmt_info))
1350 step_preserves_misalignment_p
1351 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1353 if (dump_enabled_p ())
1355 if (step_preserves_misalignment_p)
1356 dump_printf_loc (MSG_NOTE, vect_location,
1357 "inner step divides the vector alignment.\n");
1358 else
1359 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1360 "inner step doesn't divide the vector"
1361 " alignment.\n");
1365 /* Similarly we can only use base and misalignment information relative to
1366 an innermost loop if the misalignment stays the same throughout the
1367 execution of the loop. As above, this is the case if the stride of
1368 the dataref evenly divides by the alignment. */
1369 else
1371 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1372 step_preserves_misalignment_p
1373 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1375 if (!step_preserves_misalignment_p && dump_enabled_p ())
1376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1377 "step doesn't divide the vector alignment.\n");
1380 unsigned int base_alignment = drb->base_alignment;
1381 unsigned int base_misalignment = drb->base_misalignment;
1383 /* Calculate the maximum of the pooled base address alignment and the
1384 alignment that we can compute for DR itself. */
1385 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1386 = base_alignments->get (drb->base_address);
1387 if (entry
1388 && base_alignment < (*entry).second->base_alignment
1389 && (loop_vinfo
1390 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1391 gimple_bb (entry->first->stmt))
1392 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1393 || (entry->first->dr_aux.group <= dr_info->group)))))
1395 base_alignment = entry->second->base_alignment;
1396 base_misalignment = entry->second->base_misalignment;
1399 if (drb->offset_alignment < vect_align_c
1400 || !step_preserves_misalignment_p
1401 /* We need to know whether the step wrt the vectorized loop is
1402 negative when computing the starting misalignment below. */
1403 || TREE_CODE (drb->step) != INTEGER_CST)
1405 if (dump_enabled_p ())
1406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1407 "Unknown alignment for access: %T\n", ref);
1408 return;
1411 if (base_alignment < vect_align_c)
1413 unsigned int max_alignment;
1414 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1415 if (max_alignment < vect_align_c
1416 || !vect_can_force_dr_alignment_p (base,
1417 vect_align_c * BITS_PER_UNIT))
1419 if (dump_enabled_p ())
1420 dump_printf_loc (MSG_NOTE, vect_location,
1421 "can't force alignment of ref: %T\n", ref);
1422 return;
1425 /* Force the alignment of the decl.
1426 NOTE: This is the only change to the code we make during
1427 the analysis phase, before deciding to vectorize the loop. */
1428 if (dump_enabled_p ())
1429 dump_printf_loc (MSG_NOTE, vect_location,
1430 "force alignment of %T\n", ref);
1432 dr_info->base_decl = base;
1433 dr_info->base_misaligned = true;
1434 base_misalignment = 0;
1436 poly_int64 misalignment
1437 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1439 unsigned int const_misalignment;
1440 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1442 if (dump_enabled_p ())
1443 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1444 "Non-constant misalignment for access: %T\n", ref);
1445 return;
1448 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1450 if (dump_enabled_p ())
1451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1452 "misalign = %d bytes of ref %T\n",
1453 const_misalignment, ref);
1455 return;
1458 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1459 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1460 is made aligned via peeling. */
1462 static bool
1463 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1464 dr_vec_info *dr_peel_info)
1466 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1467 DR_TARGET_ALIGNMENT (dr_info)))
1469 poly_offset_int diff
1470 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1471 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1472 if (known_eq (diff, 0)
1473 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1474 return true;
1476 return false;
1479 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1480 aligned via peeling. */
1482 static bool
1483 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1484 dr_vec_info *dr_peel_info)
1486 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1487 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1488 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1489 DR_OFFSET (dr_peel_info->dr), 0)
1490 || !operand_equal_p (DR_STEP (dr_info->dr),
1491 DR_STEP (dr_peel_info->dr), 0))
1492 return false;
1494 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1497 /* Compute the value for dr_info->misalign so that the access appears
1498 aligned. This is used by peeling to compensate for dr_misalignment
1499 applying the offset for negative step. */
1502 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1504 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1505 return 0;
1507 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1508 poly_int64 misalignment
1509 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1510 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1512 unsigned HOST_WIDE_INT target_alignment_c;
1513 int misalign;
1514 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1515 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1516 return DR_MISALIGNMENT_UNKNOWN;
1517 return misalign;
1520 /* Function vect_update_misalignment_for_peel.
1521 Sets DR_INFO's misalignment
1522 - to 0 if it has the same alignment as DR_PEEL_INFO,
1523    - to the misalignment computed using NPEEL if DR_INFO's alignment is known,
1524 - to -1 (unknown) otherwise.
1526 DR_INFO - the data reference whose misalignment is to be adjusted.
1527 DR_PEEL_INFO - the data reference whose misalignment is being made
1528 zero in the vector loop by the peel.
1529 NPEEL - the number of iterations in the peel loop if the misalignment
1530 of DR_PEEL_INFO is known at compile time. */
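/* For example, with a known step of 4 bytes, a misalignment of 4 and a
   target alignment of 16 bytes, peeling NPEEL == 3 iterations gives
   (4 + 3 * 4) & 15 == 0, i.e. the access becomes aligned.  */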
1532 static void
1533 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1534 dr_vec_info *dr_peel_info, int npeel)
1536   /* If dr_info is aligned whenever dr_peel_info is, then mark it so.  */
1537 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1539 SET_DR_MISALIGNMENT (dr_info,
1540 vect_dr_misalign_for_aligned_access (dr_peel_info));
1541 return;
1544 unsigned HOST_WIDE_INT alignment;
1545 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1546 && known_alignment_for_access_p (dr_info,
1547 STMT_VINFO_VECTYPE (dr_info->stmt))
1548 && known_alignment_for_access_p (dr_peel_info,
1549 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1551 int misal = dr_info->misalignment;
1552 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1553 misal &= alignment - 1;
1554 set_dr_misalignment (dr_info, misal);
1555 return;
1558 if (dump_enabled_p ())
1559 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1560 "to unknown (-1).\n");
1561 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1564 /* Return true if alignment is relevant for DR_INFO. */
1566 static bool
1567 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1569 stmt_vec_info stmt_info = dr_info->stmt;
1571 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1572 return false;
1574 /* For interleaving, only the alignment of the first access matters. */
1575 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1576 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1577 return false;
1579 /* Scatter-gather and invariant accesses continue to address individual
1580 scalars, so vector-level alignment is irrelevant. */
1581 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1582 || integer_zerop (DR_STEP (dr_info->dr)))
1583 return false;
1585 /* Strided accesses perform only component accesses, alignment is
1586 irrelevant for them. */
1587 if (STMT_VINFO_STRIDED_P (stmt_info)
1588 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1589 return false;
1591 return true;
1594 /* Given a memory reference EXP return whether its alignment is less
1595 than its size. */
1597 static bool
1598 not_size_aligned (tree exp)
1600 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1601 return true;
1603 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1604 > get_object_alignment (exp));
1607 /* Function vector_alignment_reachable_p
1609 Return true if vector alignment for DR_INFO is reachable by peeling
1610 a few loop iterations. Return false otherwise. */
1612 static bool
1613 vector_alignment_reachable_p (dr_vec_info *dr_info)
1615 stmt_vec_info stmt_info = dr_info->stmt;
1616 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1618 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1620 /* For interleaved access we peel only if number of iterations in
1621 the prolog loop ({VF - misalignment}), is a multiple of the
1622 number of the interleaved accesses. */
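      /* For example, with 8 elements per vector, a misalignment of 3 elements
	 and a group size of 2 the prolog would need 5 scalar iterations, which
	 is not a multiple of the group size, so peeling cannot reach
	 alignment.  */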
1623 int elem_size, mis_in_elements;
1625 /* FORNOW: handle only known alignment. */
1626 if (!known_alignment_for_access_p (dr_info, vectype))
1627 return false;
1629 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1630 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1631 elem_size = vector_element_size (vector_size, nelements);
1632 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1634 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1635 return false;
1638 /* If misalignment is known at the compile time then allow peeling
1639 only if natural alignment is reachable through peeling. */
1640 if (known_alignment_for_access_p (dr_info, vectype)
1641 && !aligned_access_p (dr_info, vectype))
1643 HOST_WIDE_INT elmsize =
1644 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1645 if (dump_enabled_p ())
1647 dump_printf_loc (MSG_NOTE, vect_location,
1648 "data size = %wd. misalignment = %d.\n", elmsize,
1649 dr_misalignment (dr_info, vectype));
1651 if (dr_misalignment (dr_info, vectype) % elmsize)
1653 if (dump_enabled_p ())
1654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1655 "data size does not divide the misalignment.\n");
1656 return false;
1660 if (!known_alignment_for_access_p (dr_info, vectype))
1662 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1663 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1664 if (dump_enabled_p ())
1665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1666 "Unknown misalignment, %snaturally aligned\n",
1667 is_packed ? "not " : "");
1668 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1671 return true;
1675 /* Calculate the cost of the memory access represented by DR_INFO. */
1677 static void
1678 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1679 dr_alignment_support alignment_support_scheme,
1680 int misalignment,
1681 unsigned int *inside_cost,
1682 unsigned int *outside_cost,
1683 stmt_vector_for_cost *body_cost_vec,
1684 stmt_vector_for_cost *prologue_cost_vec)
1686 stmt_vec_info stmt_info = dr_info->stmt;
1687 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1688 int ncopies;
1690 if (PURE_SLP_STMT (stmt_info))
1691 ncopies = 1;
1692 else
1693 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1695 if (DR_IS_READ (dr_info->dr))
1696 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1697 misalignment, true, inside_cost,
1698 outside_cost, prologue_cost_vec, body_cost_vec, false);
1699 else
1700 vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1701 misalignment, inside_cost, body_cost_vec);
1703 if (dump_enabled_p ())
1704 dump_printf_loc (MSG_NOTE, vect_location,
1705 "vect_get_data_access_cost: inside_cost = %d, "
1706 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1710 typedef struct _vect_peel_info
1712 dr_vec_info *dr_info;
1713 int npeel;
1714 unsigned int count;
1715 } *vect_peel_info;
1717 typedef struct _vect_peel_extended_info
1719 vec_info *vinfo;
1720 struct _vect_peel_info peel_info;
1721 unsigned int inside_cost;
1722 unsigned int outside_cost;
1723 } *vect_peel_extended_info;
1726 /* Peeling hashtable helpers. */
1728 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1730 static inline hashval_t hash (const _vect_peel_info *);
1731 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1734 inline hashval_t
1735 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1737 return (hashval_t) peel_info->npeel;
1740 inline bool
1741 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1743 return (a->npeel == b->npeel);
1747 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1749 static void
1750 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1751 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1752 int npeel, bool supportable_if_not_aligned)
1754 struct _vect_peel_info elem, *slot;
1755 _vect_peel_info **new_slot;
1757 elem.npeel = npeel;
1758 slot = peeling_htab->find (&elem);
1759 if (slot)
1760 slot->count++;
1761 else
1763 slot = XNEW (struct _vect_peel_info);
1764 slot->npeel = npeel;
1765 slot->dr_info = dr_info;
1766 slot->count = 1;
1767 new_slot = peeling_htab->find_slot (slot, INSERT);
1768 *new_slot = slot;
1771 /* If this DR is not supported with unknown misalignment then bias
1772 this slot when the cost model is disabled. */
1773 if (!supportable_if_not_aligned
1774 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1775 slot->count += VECT_MAX_COST;
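 
 /* A sketch of how the table ends up populated (the dr_a/dr_b/dr_c names
    are hypothetical): inserting npeel values 2, 2 and 5 yields two slots,
    { npeel = 2, count = 2 } and { npeel = 5, count = 1 }, so the
    "most frequent" traversal below would pick npeel = 2.
 
      hash_table<peel_info_hasher> htab (1);
      vect_peeling_hash_insert (&htab, loop_vinfo, dr_a, 2, true);
      vect_peeling_hash_insert (&htab, loop_vinfo, dr_b, 2, true);
      vect_peeling_hash_insert (&htab, loop_vinfo, dr_c, 5, true);  */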
1779 /* Traverse peeling hash table to find peeling option that aligns maximum
1780 number of data accesses. */
1783 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1784 _vect_peel_extended_info *max)
1786 vect_peel_info elem = *slot;
1788 if (elem->count > max->peel_info.count
1789 || (elem->count == max->peel_info.count
1790 && max->peel_info.npeel > elem->npeel))
1792 max->peel_info.npeel = elem->npeel;
1793 max->peel_info.count = elem->count;
1794 max->peel_info.dr_info = elem->dr_info;
1797 return 1;
1800 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1801 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1802 npeel is computed at runtime but DR0_INFO's misalignment will be zero
1803 after peeling. */
1805 static void
1806 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1807 dr_vec_info *dr0_info,
1808 unsigned int *inside_cost,
1809 unsigned int *outside_cost,
1810 stmt_vector_for_cost *body_cost_vec,
1811 stmt_vector_for_cost *prologue_cost_vec,
1812 unsigned int npeel)
1814 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1816 bool dr0_alignment_known_p
1817 = (dr0_info
1818 && known_alignment_for_access_p (dr0_info,
1819 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1821 for (data_reference *dr : datarefs)
1823 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1824 if (!vect_relevant_for_alignment_p (dr_info))
1825 continue;
1827 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1828 dr_alignment_support alignment_support_scheme;
1829 int misalignment;
1830 unsigned HOST_WIDE_INT alignment;
1832 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1833 size_zero_node) < 0;
1834 poly_int64 off = 0;
1835 if (negative)
1836 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1837 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1839 if (npeel == 0)
1840 misalignment = dr_misalignment (dr_info, vectype, off);
1841 else if (dr_info == dr0_info
1842 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1843 misalignment = 0;
1844 else if (!dr0_alignment_known_p
1845 || !known_alignment_for_access_p (dr_info, vectype)
1846 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1847 misalignment = DR_MISALIGNMENT_UNKNOWN;
1848 else
1850 misalignment = dr_misalignment (dr_info, vectype, off);
1851 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1852 misalignment &= alignment - 1;
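 /* Worked example with hypothetical numbers: a misalignment of 4, npeel of
    3, DR_STEP of 4 and a target alignment of 16 gives
    (4 + 3 * 4) & (16 - 1) == 0, i.e. this DR also becomes aligned after
    peeling.  */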
1854 alignment_support_scheme
1855 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1856 misalignment);
1858 vect_get_data_access_cost (loop_vinfo, dr_info,
1859 alignment_support_scheme, misalignment,
1860 inside_cost, outside_cost,
1861 body_cost_vec, prologue_cost_vec);
1865 /* Traverse peeling hash table and calculate cost for each peeling option.
1866 Find the one with the lowest cost. */
1869 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1870 _vect_peel_extended_info *min)
1872 vect_peel_info elem = *slot;
1873 int dummy;
1874 unsigned int inside_cost = 0, outside_cost = 0;
1875 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1876 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1877 epilogue_cost_vec;
1879 prologue_cost_vec.create (2);
1880 body_cost_vec.create (2);
1881 epilogue_cost_vec.create (2);
1883 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1884 &outside_cost, &body_cost_vec,
1885 &prologue_cost_vec, elem->npeel);
1887 body_cost_vec.release ();
1889 outside_cost += vect_get_known_peeling_cost
1890 (loop_vinfo, elem->npeel, &dummy,
1891 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1892 &prologue_cost_vec, &epilogue_cost_vec);
1894 /* Prologue and epilogue costs are added to the target model later.
1895 These costs depend only on the scalar iteration cost, the
1896 number of peeling iterations finally chosen, and the number of
1897 misaligned statements. So discard the information found here. */
1898 prologue_cost_vec.release ();
1899 epilogue_cost_vec.release ();
1901 if (inside_cost < min->inside_cost
1902 || (inside_cost == min->inside_cost
1903 && outside_cost < min->outside_cost))
1905 min->inside_cost = inside_cost;
1906 min->outside_cost = outside_cost;
1907 min->peel_info.dr_info = elem->dr_info;
1908 min->peel_info.npeel = elem->npeel;
1909 min->peel_info.count = elem->count;
1912 return 1;
1916 /* Choose best peeling option by traversing peeling hash table and either
1917 choosing an option with the lowest cost (if cost model is enabled) or the
1918 option that aligns as many accesses as possible. */
1920 static struct _vect_peel_extended_info
1921 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1922 loop_vec_info loop_vinfo)
1924 struct _vect_peel_extended_info res;
1926 res.peel_info.dr_info = NULL;
1927 res.vinfo = loop_vinfo;
1929 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1931 res.inside_cost = INT_MAX;
1932 res.outside_cost = INT_MAX;
1933 peeling_htab->traverse <_vect_peel_extended_info *,
1934 vect_peeling_hash_get_lowest_cost> (&res);
1936 else
1938 res.peel_info.count = 0;
1939 peeling_htab->traverse <_vect_peel_extended_info *,
1940 vect_peeling_hash_get_most_frequent> (&res);
1941 res.inside_cost = 0;
1942 res.outside_cost = 0;
1945 return res;
1948 /* Return true if the new peeling NPEEL is supported. */
1950 static bool
1951 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1952 unsigned npeel)
1954 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1955 enum dr_alignment_support supportable_dr_alignment;
1957 bool dr0_alignment_known_p
1958 = known_alignment_for_access_p (dr0_info,
1959 STMT_VINFO_VECTYPE (dr0_info->stmt));
1961 /* Ensure that all data refs can be vectorized after the peel. */
1962 for (data_reference *dr : datarefs)
1964 if (dr == dr0_info->dr)
1965 continue;
1967 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1968 if (!vect_relevant_for_alignment_p (dr_info)
1969 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1970 continue;
1972 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1973 int misalignment;
1974 unsigned HOST_WIDE_INT alignment;
1975 if (!dr0_alignment_known_p
1976 || !known_alignment_for_access_p (dr_info, vectype)
1977 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1978 misalignment = DR_MISALIGNMENT_UNKNOWN;
1979 else
1981 misalignment = dr_misalignment (dr_info, vectype);
1982 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1983 misalignment &= alignment - 1;
1985 supportable_dr_alignment
1986 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1987 misalignment);
1988 if (supportable_dr_alignment == dr_unaligned_unsupported)
1989 return false;
1992 return true;
1995 /* Compare two data-references DRA and DRB to group them into chunks
1996 with related alignment. */
1998 static int
1999 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
2001 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2002 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2003 int cmp;
2005 /* Stabilize sort. */
2006 if (dra == drb)
2007 return 0;
2009 /* Ordering of DRs according to base. */
2010 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2011 DR_BASE_ADDRESS (drb));
2012 if (cmp != 0)
2013 return cmp;
2015 /* And according to DR_OFFSET. */
2016 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2017 if (cmp != 0)
2018 return cmp;
2020 /* And after step. */
2021 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2022 if (cmp != 0)
2023 return cmp;
2025 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2026 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2027 if (cmp == 0)
2028 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2029 return cmp;
2032 /* Function vect_enhance_data_refs_alignment
2034 This pass will use loop versioning and loop peeling in order to enhance
2035 the alignment of data references in the loop.
2037 FOR NOW: we assume that whatever versioning/peeling takes place, only the
2038 original loop is to be vectorized. Any other loops that are created by
2039 the transformations performed in this pass are not supposed to be
2040 vectorized. This restriction will be relaxed.
2042 This pass will require a cost model to guide it whether to apply peeling
2043 or versioning or a combination of the two. For example, the scheme that
2044 Intel uses when given a loop with several memory accesses is as follows:
2045 choose one memory access ('p') whose alignment you want to force by doing
2046 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
2047 other accesses are not necessarily aligned, or (2) use loop versioning to
2048 generate one loop in which all accesses are aligned, and another loop in
2049 which only 'p' is necessarily aligned.
2051 ("Automatic Intra-Register Vectorization for the Intel Architecture",
2052 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
2053 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
2055 Devising a cost model is the most critical aspect of this work. It will
2056 guide us on which access to peel for, whether to use loop versioning, how
2057 many versions to create, etc. The cost model will probably consist of
2058 generic considerations as well as target specific considerations (on
2059 powerpc for example, misaligned stores are more painful than misaligned
2060 loads).
2062 Here are the general steps involved in alignment enhancements:
2064 -- original loop, before alignment analysis:
2065 for (i=0; i<N; i++){
2066 x = q[i]; # DR_MISALIGNMENT(q) = unknown
2067 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2070 -- After vect_compute_data_refs_alignment:
2071 for (i=0; i<N; i++){
2072 x = q[i]; # DR_MISALIGNMENT(q) = 3
2073 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2076 -- Possibility 1: we do loop versioning:
2077 if (p is aligned) {
2078 for (i=0; i<N; i++){ # loop 1A
2079 x = q[i]; # DR_MISALIGNMENT(q) = 3
2080 p[i] = y; # DR_MISALIGNMENT(p) = 0
2083 else {
2084 for (i=0; i<N; i++){ # loop 1B
2085 x = q[i]; # DR_MISALIGNMENT(q) = 3
2086 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2090 -- Possibility 2: we do loop peeling:
2091 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2092 x = q[i];
2093 p[i] = y;
2095 for (i = 3; i < N; i++){ # loop 2A
2096 x = q[i]; # DR_MISALIGNMENT(q) = 0
2097 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2100 -- Possibility 3: combination of loop peeling and versioning:
2101 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2102 x = q[i];
2103 p[i] = y;
2105 if (p is aligned) {
2106 for (i = 3; i<N; i++){ # loop 3A
2107 x = q[i]; # DR_MISALIGNMENT(q) = 0
2108 p[i] = y; # DR_MISALIGNMENT(p) = 0
2111 else {
2112 for (i = 3; i<N; i++){ # loop 3B
2113 x = q[i]; # DR_MISALIGNMENT(q) = 0
2114 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2118 These loops are later passed to loop_transform to be vectorized. The
2119 vectorizer will use the alignment information to guide the transformation
2120 (whether to generate regular loads/stores, or with special handling for
2121 misalignment). */
2123 opt_result
2124 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
2126 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2127 dr_vec_info *first_store = NULL;
2128 dr_vec_info *dr0_info = NULL;
2129 struct data_reference *dr;
2130 unsigned int i;
2131 bool do_peeling = false;
2132 bool do_versioning = false;
2133 unsigned int npeel = 0;
2134 bool one_misalignment_known = false;
2135 bool one_misalignment_unknown = false;
2136 bool one_dr_unsupportable = false;
2137 dr_vec_info *unsupportable_dr_info = NULL;
2138 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
2139 hash_table<peel_info_hasher> peeling_htab (1);
2141 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
2143 /* Reset data so we can safely be called multiple times. */
2144 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2145 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
2147 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
2148 return opt_result::success ();
2150 /* Sort the vector of datarefs so DRs that have the same or dependent
2151 alignment are next to each other. */
2152 auto_vec<data_reference_p> datarefs
2153 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
2154 datarefs.qsort (dr_align_group_sort_cmp);
2156 /* For each dataref, compute the number of other DRs that become aligned
2157 when we peel so that this dataref becomes aligned. */
2158 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
2159 n_same_align_refs.quick_grow_cleared (datarefs.length ());
2160 unsigned i0;
2161 for (i0 = 0; i0 < datarefs.length (); ++i0)
2162 if (DR_BASE_ADDRESS (datarefs[i0]))
2163 break;
2164 for (i = i0 + 1; i <= datarefs.length (); ++i)
2166 if (i == datarefs.length ()
2167 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
2168 DR_BASE_ADDRESS (datarefs[i]), 0)
2169 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
2170 DR_OFFSET (datarefs[i]), 0)
2171 || !operand_equal_p (DR_STEP (datarefs[i0]),
2172 DR_STEP (datarefs[i]), 0))
2174 /* The subgroup [i0, i-1] now only differs in DR_INIT and
2175 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
2176 will get known misalignment if we align one of the refs
2177 with the largest DR_TARGET_ALIGNMENT. */
2178 for (unsigned j = i0; j < i; ++j)
2180 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
2181 for (unsigned k = i0; k < i; ++k)
2183 if (k == j)
2184 continue;
2185 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
2186 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
2187 dr_infoj))
2188 n_same_align_refs[j]++;
2191 i0 = i;
2195 /* While cost model enhancements are expected in the future, the high level
2196 view of the code at this time is as follows:
2198 A) If there is a misaligned access then see if peeling to align
2199 this access can make all data references satisfy
2200 vect_supportable_dr_alignment. If so, update data structures
2201 as needed and return true.
2203 B) If peeling wasn't possible and there is a data reference with an
2204 unknown misalignment that does not satisfy vect_supportable_dr_alignment
2205 then see if loop versioning checks can be used to make all data
2206 references satisfy vect_supportable_dr_alignment. If so, update
2207 data structures as needed and return true.
2209 C) If neither peeling nor versioning were successful then return false if
2210 any data reference does not satisfy vect_supportable_dr_alignment.
2212 D) Return true (all data references satisfy vect_supportable_dr_alignment).
2214 Note, Possibility 3 above (which is peeling and versioning together) is not
2215 being done at this time. */
2217 /* (1) Peeling to force alignment. */
2219 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
2220 Considerations:
2221 + How many accesses will become aligned due to the peeling
2222 - How many accesses will become unaligned due to the peeling,
2223 and the cost of misaligned accesses.
2224 - The cost of peeling (the extra runtime checks, the increase
2225 in code size). */
2227 FOR_EACH_VEC_ELT (datarefs, i, dr)
2229 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2230 if (!vect_relevant_for_alignment_p (dr_info))
2231 continue;
2233 stmt_vec_info stmt_info = dr_info->stmt;
2234 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2235 do_peeling = vector_alignment_reachable_p (dr_info);
2236 if (do_peeling)
2238 if (known_alignment_for_access_p (dr_info, vectype))
2240 unsigned int npeel_tmp = 0;
2241 bool negative = tree_int_cst_compare (DR_STEP (dr),
2242 size_zero_node) < 0;
2244 /* If known_alignment_for_access_p then we have set
2245 DR_MISALIGNMENT which is only done if we know it at compile
2246 time, so it is safe to assume target alignment is constant.
2248 unsigned int target_align =
2249 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
2250 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
2251 poly_int64 off = 0;
2252 if (negative)
2253 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
2254 unsigned int mis = dr_misalignment (dr_info, vectype, off);
2255 mis = negative ? mis : -mis;
2256 if (mis != 0)
2257 npeel_tmp = (mis & (target_align - 1)) / dr_size;
2259 /* For multiple types, it is possible that the bigger type access
2260 will have more than one peeling option. E.g., a loop with two
2261 types: one of size (vector size / 4), and the other one of
2262 size (vector size / 8). The vectorization factor will be 8. If both
2263 accesses are misaligned by 3, the first one needs one scalar
2264 iteration to be aligned, and the second one needs 5. But the
2265 first one will be aligned also by peeling 5 scalar
2266 iterations, and in that case both accesses will be aligned.
2267 Hence, except for the immediate peeling amount, we also want
2268 to try to add full vector size, while we don't exceed
2269 vectorization factor.
2270 We do this automatically for cost model, since we calculate
2271 cost for every peeling option. */
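 /* Continuing the example above (all numbers hypothetical): if the
    immediate peeling amount is 1 and target_align / dr_size is 4, the
    candidate npeel values recorded below are 1, 5, 9, ... up to the
    vectorization factor (times the group size for SLP).  */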
2272 poly_uint64 nscalars = npeel_tmp;
2273 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2275 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2276 nscalars = (STMT_SLP_TYPE (stmt_info)
2277 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
2280 /* Save info about DR in the hash table. Also include peeling
2281 amounts according to the explanation above. Indicate
2282 the alignment status when the ref is not aligned.
2283 ??? Rather than using unknown alignment here we should
2284 prune all entries from the peeling hashtable which cause
2285 DRs to be not supported. */
2286 bool supportable_if_not_aligned
2287 = vect_supportable_dr_alignment
2288 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2289 while (known_le (npeel_tmp, nscalars))
2291 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2292 dr_info, npeel_tmp,
2293 supportable_if_not_aligned);
2294 npeel_tmp += MAX (1, target_align / dr_size);
2297 one_misalignment_known = true;
2299 else
2301 /* If we don't know any misalignment values, we prefer
2302 peeling for the data-ref that has the maximum number of data-refs
2303 with the same alignment, unless the target prefers to align
2304 stores over loads. */
2305 unsigned same_align_drs = n_same_align_refs[i];
2306 if (!dr0_info
2307 || dr0_same_align_drs < same_align_drs)
2309 dr0_same_align_drs = same_align_drs;
2310 dr0_info = dr_info;
2312 /* For data-refs with the same number of related
2313 accesses prefer the one where the misalign
2314 computation will be invariant in the outermost loop. */
2315 else if (dr0_same_align_drs == same_align_drs)
2317 class loop *ivloop0, *ivloop;
2318 ivloop0 = outermost_invariant_loop_for_expr
2319 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2320 ivloop = outermost_invariant_loop_for_expr
2321 (loop, DR_BASE_ADDRESS (dr));
2322 if ((ivloop && !ivloop0)
2323 || (ivloop && ivloop0
2324 && flow_loop_nested_p (ivloop, ivloop0)))
2325 dr0_info = dr_info;
2328 one_misalignment_unknown = true;
2330 /* Check for data refs with unsupportable alignment that
2331 can be peeled. */
2332 enum dr_alignment_support supportable_dr_alignment
2333 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2334 DR_MISALIGNMENT_UNKNOWN);
2335 if (supportable_dr_alignment == dr_unaligned_unsupported)
2337 one_dr_unsupportable = true;
2338 unsupportable_dr_info = dr_info;
2341 if (!first_store && DR_IS_WRITE (dr))
2343 first_store = dr_info;
2344 first_store_same_align_drs = same_align_drs;
2348 else
2350 if (!aligned_access_p (dr_info, vectype))
2352 if (dump_enabled_p ())
2353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2354 "vector alignment may not be reachable\n");
2355 break;
2360 /* Check if we can possibly peel the loop. */
2361 if (!vect_can_advance_ivs_p (loop_vinfo)
2362 || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
2363 loop_preheader_edge (loop))
2364 || loop->inner
2365 || LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
2366 do_peeling = false;
2368 struct _vect_peel_extended_info peel_for_known_alignment;
2369 struct _vect_peel_extended_info peel_for_unknown_alignment;
2370 struct _vect_peel_extended_info best_peel;
2372 peel_for_unknown_alignment.inside_cost = INT_MAX;
2373 peel_for_unknown_alignment.outside_cost = INT_MAX;
2374 peel_for_unknown_alignment.peel_info.count = 0;
2376 if (do_peeling
2377 && one_misalignment_unknown)
2379 /* Check if the target requires us to prefer stores over loads, i.e., if
2380 misaligned stores are more expensive than misaligned loads (taking
2381 drs with same alignment into account). */
2382 unsigned int load_inside_cost = 0;
2383 unsigned int load_outside_cost = 0;
2384 unsigned int store_inside_cost = 0;
2385 unsigned int store_outside_cost = 0;
2386 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2388 stmt_vector_for_cost dummy;
2389 dummy.create (2);
2390 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2391 &load_inside_cost,
2392 &load_outside_cost,
2393 &dummy, &dummy, estimated_npeels);
2394 dummy.release ();
2396 if (first_store)
2398 dummy.create (2);
2399 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2400 &store_inside_cost,
2401 &store_outside_cost,
2402 &dummy, &dummy,
2403 estimated_npeels);
2404 dummy.release ();
2406 else
2408 store_inside_cost = INT_MAX;
2409 store_outside_cost = INT_MAX;
2412 if (load_inside_cost > store_inside_cost
2413 || (load_inside_cost == store_inside_cost
2414 && load_outside_cost > store_outside_cost))
2416 dr0_info = first_store;
2417 dr0_same_align_drs = first_store_same_align_drs;
2418 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2419 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2421 else
2423 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2424 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2427 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2428 prologue_cost_vec.create (2);
2429 epilogue_cost_vec.create (2);
2431 int dummy2;
2432 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2433 (loop_vinfo, estimated_npeels, &dummy2,
2434 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2435 &prologue_cost_vec, &epilogue_cost_vec);
2437 prologue_cost_vec.release ();
2438 epilogue_cost_vec.release ();
2440 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2443 peel_for_unknown_alignment.peel_info.npeel = 0;
2444 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2446 best_peel = peel_for_unknown_alignment;
2448 peel_for_known_alignment.inside_cost = INT_MAX;
2449 peel_for_known_alignment.outside_cost = INT_MAX;
2450 peel_for_known_alignment.peel_info.count = 0;
2451 peel_for_known_alignment.peel_info.dr_info = NULL;
2453 if (do_peeling && one_misalignment_known)
2455 /* Peeling is possible, but there is no data access that is not supported
2456 unless aligned. So we try to choose the best possible peeling from
2457 the hash table. */
2458 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2459 (&peeling_htab, loop_vinfo);
2462 /* Compare costs of peeling for known and unknown alignment. */
2463 if (peel_for_known_alignment.peel_info.dr_info != NULL
2464 && peel_for_unknown_alignment.inside_cost
2465 >= peel_for_known_alignment.inside_cost)
2467 best_peel = peel_for_known_alignment;
2469 /* If the best peeling for known alignment has NPEEL == 0, perform no
2470 peeling at all except if there is an unsupportable dr that we can
2471 align. */
2472 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2473 do_peeling = false;
2476 /* If there is an unsupportable data ref, prefer this over all choices so far
2477 since any chosen peeling would have to be discarded unless it happened
2478 to align the unsupportable data ref. */
2479 if (one_dr_unsupportable)
2480 dr0_info = unsupportable_dr_info;
2481 else if (do_peeling)
2483 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2484 TODO: Use nopeel_outside_cost or get rid of it? */
2485 unsigned nopeel_inside_cost = 0;
2486 unsigned nopeel_outside_cost = 0;
2488 stmt_vector_for_cost dummy;
2489 dummy.create (2);
2490 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2491 &nopeel_outside_cost, &dummy, &dummy, 0);
2492 dummy.release ();
2494 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2495 costs will be recorded. */
2496 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2497 prologue_cost_vec.create (2);
2498 epilogue_cost_vec.create (2);
2500 int dummy2;
2501 nopeel_outside_cost += vect_get_known_peeling_cost
2502 (loop_vinfo, 0, &dummy2,
2503 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2504 &prologue_cost_vec, &epilogue_cost_vec);
2506 prologue_cost_vec.release ();
2507 epilogue_cost_vec.release ();
2509 npeel = best_peel.peel_info.npeel;
2510 dr0_info = best_peel.peel_info.dr_info;
2512 /* If not peeling is no more expensive than the best peeling we
2513 have found so far, don't perform any peeling. */
2514 if (nopeel_inside_cost <= best_peel.inside_cost)
2515 do_peeling = false;
2518 if (do_peeling)
2520 stmt_vec_info stmt_info = dr0_info->stmt;
2521 if (known_alignment_for_access_p (dr0_info,
2522 STMT_VINFO_VECTYPE (stmt_info)))
2524 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2525 size_zero_node) < 0;
2526 if (!npeel)
2528 /* Since it's known at compile time, compute the number of
2529 iterations in the peeled loop (the peeling factor) for use in
2530 updating DR_MISALIGNMENT values. The peeling factor is the
2531 vectorization factor minus the misalignment as an element
2532 count. */
2533 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2534 poly_int64 off = 0;
2535 if (negative)
2536 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2537 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2538 unsigned int mis
2539 = dr_misalignment (dr0_info, vectype, off);
2540 mis = negative ? mis : -mis;
2541 /* If known_alignment_for_access_p then we have set
2542 DR_MISALIGNMENT which is only done if we know it at compile
2543 time, so it is safe to assume target alignment is constant.
2545 unsigned int target_align =
2546 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2547 npeel = ((mis & (target_align - 1))
2548 / vect_get_scalar_dr_size (dr0_info));
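 /* For example (hypothetical numbers): with a target alignment of 16 bytes,
    4-byte elements and a misalignment of 8 bytes, the negated misalignment
    is -8, so (-8 & 15) / 4 == 2 scalar iterations are peeled to reach the
    next alignment boundary.  */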
2551 /* For interleaved data access every iteration accesses all the
2552 members of the group, therefore we divide the number of iterations
2553 by the group size. */
2554 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2555 npeel /= DR_GROUP_SIZE (stmt_info);
2557 if (dump_enabled_p ())
2558 dump_printf_loc (MSG_NOTE, vect_location,
2559 "Try peeling by %d\n", npeel);
2562 /* Ensure that all datarefs can be vectorized after the peel. */
2563 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2564 do_peeling = false;
2566 /* Check if all datarefs are supportable and log. */
2567 if (do_peeling
2568 && npeel == 0
2569 && known_alignment_for_access_p (dr0_info,
2570 STMT_VINFO_VECTYPE (stmt_info)))
2571 return opt_result::success ();
2573 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2574 if (do_peeling)
2576 unsigned max_allowed_peel
2577 = param_vect_max_peeling_for_alignment;
2578 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2579 max_allowed_peel = 0;
2580 if (max_allowed_peel != (unsigned)-1)
2582 unsigned max_peel = npeel;
2583 if (max_peel == 0)
2585 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2586 unsigned HOST_WIDE_INT target_align_c;
2587 if (target_align.is_constant (&target_align_c))
2588 max_peel =
2589 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2590 else
2592 do_peeling = false;
2593 if (dump_enabled_p ())
2594 dump_printf_loc (MSG_NOTE, vect_location,
2595 "Disable peeling, max peels set and vector"
2596 " alignment unknown\n");
2599 if (max_peel > max_allowed_peel)
2601 do_peeling = false;
2602 if (dump_enabled_p ())
2603 dump_printf_loc (MSG_NOTE, vect_location,
2604 "Disable peeling, max peels reached: %d\n", max_peel);
2609 /* Cost model #2 - if peeling may result in a remaining loop not
2610 iterating enough to be vectorized then do not peel. Since this
2611 is a cost heuristic rather than a correctness decision, use the
2612 most likely runtime value for variable vectorization factors. */
2613 if (do_peeling
2614 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2616 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2617 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2618 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2619 < assumed_vf + max_peel)
2620 do_peeling = false;
2623 if (do_peeling)
2625 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2626 If the misalignment of DR_i is identical to that of dr0 then set
2627 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2628 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2629 by the peeling factor times the element size of DR_i (MOD the
2630 vectorization factor times the size). Otherwise, the
2631 misalignment of DR_i must be set to unknown. */
2632 FOR_EACH_VEC_ELT (datarefs, i, dr)
2633 if (dr != dr0_info->dr)
2635 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2636 if (!vect_relevant_for_alignment_p (dr_info))
2637 continue;
2639 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2642 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2643 if (npeel)
2644 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2645 else
2646 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2647 SET_DR_MISALIGNMENT (dr0_info,
2648 vect_dr_misalign_for_aligned_access (dr0_info));
2649 if (dump_enabled_p ())
2651 dump_printf_loc (MSG_NOTE, vect_location,
2652 "Alignment of access forced using peeling.\n");
2653 dump_printf_loc (MSG_NOTE, vect_location,
2654 "Peeling for alignment will be applied.\n");
2657 /* The inside-loop cost will be accounted for in vectorizable_load
2658 and vectorizable_store correctly with adjusted alignments.
2659 Drop the body_cost_vec on the floor here. */
2660 return opt_result::success ();
2664 /* (2) Versioning to force alignment. */
2666 /* Try versioning if:
2667 1) optimize loop for speed and the cost-model is not cheap
2668 2) there is at least one unsupported misaligned data ref with an unknown
2669 misalignment, and
2670 3) all misaligned data refs with a known misalignment are supported, and
2671 4) the number of runtime alignment checks is within reason. */
2673 do_versioning
2674 = (optimize_loop_nest_for_speed_p (loop)
2675 && !loop->inner /* FORNOW */
2676 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2678 if (do_versioning)
2680 FOR_EACH_VEC_ELT (datarefs, i, dr)
2682 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2683 if (!vect_relevant_for_alignment_p (dr_info))
2684 continue;
2686 stmt_vec_info stmt_info = dr_info->stmt;
2687 if (STMT_VINFO_STRIDED_P (stmt_info))
2689 do_versioning = false;
2690 break;
2693 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2694 bool negative = tree_int_cst_compare (DR_STEP (dr),
2695 size_zero_node) < 0;
2696 poly_int64 off = 0;
2697 if (negative)
2698 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2699 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2700 int misalignment;
2701 if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2702 continue;
2704 enum dr_alignment_support supportable_dr_alignment
2705 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2706 misalignment);
2707 if (supportable_dr_alignment == dr_unaligned_unsupported)
2709 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2710 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2711 >= (unsigned) param_vect_max_version_for_alignment_checks))
2713 do_versioning = false;
2714 break;
2717 /* At present we don't support versioning for alignment
2718 with variable VF, since there's no guarantee that the
2719 VF is a power of two. We could relax this if we added
2720 a way of enforcing a power-of-two size. */
2721 unsigned HOST_WIDE_INT size;
2722 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2724 do_versioning = false;
2725 break;
2728 /* Forcing alignment in the first iteration is no good if
2729 we don't keep it across iterations. For now, just disable
2730 versioning in this case.
2731 ??? We could actually unroll the loop to achieve the required
2732 overall step alignment, and forcing the alignment could be
2733 done by doing some iterations of the non-vectorized loop. */
2734 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2735 * DR_STEP_ALIGNMENT (dr),
2736 DR_TARGET_ALIGNMENT (dr_info)))
2738 do_versioning = false;
2739 break;
2742 /* The rightmost bits of an aligned address must be zeros.
2743 Construct the mask needed for this test. For example,
2744 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2745 mask must be 15 = 0xf. */
2746 int mask = size - 1;
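 /* The runtime test emitted for versioning is conceptually the following
    (a simplified sketch; 'p' stands for any potentially misaligned
    pointer):
 
      if (((uintptr_t) p & mask) == 0)
        ... vectorized loop ...
      else
        ... scalar loop ...                                            */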
2748 /* FORNOW: use the same mask to test all potentially unaligned
2749 references in the loop. */
2750 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2751 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2753 do_versioning = false;
2754 break;
2757 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2758 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2762 /* Versioning requires at least one misaligned data reference. */
2763 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2764 do_versioning = false;
2765 else if (!do_versioning)
2766 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2769 if (do_versioning)
2771 const vec<stmt_vec_info> &may_misalign_stmts
2772 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2773 stmt_vec_info stmt_info;
2775 /* It can now be assumed that the data references in the statements
2776 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2777 of the loop being vectorized. */
2778 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2780 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2781 SET_DR_MISALIGNMENT (dr_info,
2782 vect_dr_misalign_for_aligned_access (dr_info));
2783 if (dump_enabled_p ())
2784 dump_printf_loc (MSG_NOTE, vect_location,
2785 "Alignment of access forced using versioning.\n");
2788 if (dump_enabled_p ())
2789 dump_printf_loc (MSG_NOTE, vect_location,
2790 "Versioning for alignment will be applied.\n");
2792 /* Peeling and versioning can't be done together at this time. */
2793 gcc_assert (! (do_peeling && do_versioning));
2795 return opt_result::success ();
2798 /* This point is reached if neither peeling nor versioning is being done. */
2799 gcc_assert (! (do_peeling || do_versioning));
2801 return opt_result::success ();
2805 /* Function vect_analyze_data_refs_alignment
2807 Analyze the alignment of the data-references in the loop.
2808 Return FALSE if a data reference is found that cannot be vectorized. */
2810 opt_result
2811 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2813 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2815 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2816 struct data_reference *dr;
2817 unsigned int i;
2819 vect_record_base_alignments (loop_vinfo);
2820 FOR_EACH_VEC_ELT (datarefs, i, dr)
2822 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2823 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2825 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2826 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2827 continue;
2828 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2829 STMT_VINFO_VECTYPE (dr_info->stmt));
2833 return opt_result::success ();
2837 /* Analyze alignment of DRs of stmts in NODE. */
2839 static bool
2840 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2842 /* Alignment is maintained in the first element of the group. */
2843 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2844 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2845 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2846 tree vectype = SLP_TREE_VECTYPE (node);
2847 poly_uint64 vector_alignment
2848 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2849 BITS_PER_UNIT);
2850 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2851 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2852 /* Re-analyze alignment when we're facing a vectorization with a bigger
2853 alignment requirement. */
2854 else if (known_lt (dr_info->target_alignment, vector_alignment))
2856 poly_uint64 old_target_alignment = dr_info->target_alignment;
2857 int old_misalignment = dr_info->misalignment;
2858 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2859 /* But keep knowledge about a smaller alignment. */
2860 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2861 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2863 dr_info->target_alignment = old_target_alignment;
2864 dr_info->misalignment = old_misalignment;
2867 /* If we ever face unordered target alignments, the first one wins in terms
2868 of analysis and the other will become unknown in dr_misalignment. */
2869 return true;
2872 /* Function vect_slp_analyze_instance_alignment
2874 Analyze the alignment of the data-references in the SLP instance.
2875 Return FALSE if a data reference is found that cannot be vectorized. */
2877 bool
2878 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2879 slp_instance instance)
2881 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2883 slp_tree node;
2884 unsigned i;
2885 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2886 if (! vect_slp_analyze_node_alignment (vinfo, node))
2887 return false;
2889 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2890 && ! vect_slp_analyze_node_alignment
2891 (vinfo, SLP_INSTANCE_TREE (instance)))
2892 return false;
2894 return true;
2898 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2899 accesses of legal size, step, etc. Detect gaps, single element
2900 interleaving, and other special cases. Set grouped access info.
2901 Collect groups of strided stores for further use in SLP analysis.
2902 Worker for vect_analyze_group_access. */
2904 static bool
2905 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2907 data_reference *dr = dr_info->dr;
2908 tree step = DR_STEP (dr);
2909 tree scalar_type = TREE_TYPE (DR_REF (dr));
2910 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2911 stmt_vec_info stmt_info = dr_info->stmt;
2912 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2913 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2914 HOST_WIDE_INT dr_step = -1;
2915 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2916 bool slp_impossible = false;
2918 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2919 size of the interleaving group (including gaps). */
2920 if (tree_fits_shwi_p (step))
2922 dr_step = tree_to_shwi (step);
2923 /* Check that STEP is a multiple of type size. Otherwise there is
2924 a non-element-sized gap at the end of the group which we
2925 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2926 ??? As we can handle non-constant step fine here we should
2927 simply remove uses of DR_GROUP_GAP between the last and first
2928 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2929 simply not include that gap. */
2930 if ((dr_step % type_size) != 0)
2932 if (dump_enabled_p ())
2933 dump_printf_loc (MSG_NOTE, vect_location,
2934 "Step %T is not a multiple of the element size"
2935 " for %T\n",
2936 step, DR_REF (dr));
2937 return false;
2939 groupsize = absu_hwi (dr_step) / type_size;
2941 else
2942 groupsize = 0;
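 
 /* For example (hypothetical): a load from a[4*i] with 4-byte elements has
    DR_STEP 16, so GROUPSIZE is 16 / 4 = 4 even though only one element of
    each group is accessed; the single-element-interleaving case below then
    records a group of size 4 with a gap of 3.  */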
2944 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2945 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2947 /* Check if this DR is a part of interleaving, and is a single
2948 element of the group that is accessed in the loop. */
2950 /* Gaps are supported only for loads. STEP must be a multiple of the type
2951 size. */
2952 if (DR_IS_READ (dr)
2953 && (dr_step % type_size) == 0
2954 && groupsize > 0
2955 /* This could be UINT_MAX but as we are generating code in a very
2956 inefficient way we have to cap earlier.
2957 See PR91403 for example. */
2958 && groupsize <= 4096)
2960 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2961 DR_GROUP_SIZE (stmt_info) = groupsize;
2962 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2963 if (dump_enabled_p ())
2964 dump_printf_loc (MSG_NOTE, vect_location,
2965 "Detected single element interleaving %T"
2966 " step %T\n",
2967 DR_REF (dr), step);
2969 return true;
2972 if (dump_enabled_p ())
2973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2974 "not consecutive access %G", stmt_info->stmt);
2976 if (bb_vinfo)
2978 /* Mark the statement as unvectorizable. */
2979 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2980 return true;
2983 if (dump_enabled_p ())
2984 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2985 STMT_VINFO_STRIDED_P (stmt_info) = true;
2986 return true;
2989 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2991 /* First stmt in the interleaving chain. Check the chain. */
2992 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2993 struct data_reference *data_ref = dr;
2994 unsigned int count = 1;
2995 tree prev_init = DR_INIT (data_ref);
2996 HOST_WIDE_INT diff, gaps = 0;
2998 /* By construction, all group members have INTEGER_CST DR_INITs. */
2999 while (next)
3001 /* We never have the same DR multiple times. */
3002 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
3003 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
3005 data_ref = STMT_VINFO_DATA_REF (next);
3007 /* All group members have the same STEP by construction. */
3008 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
3010 /* Check that the distance between two accesses is equal to the type
3011 size. Otherwise, we have gaps. */
3012 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
3013 - TREE_INT_CST_LOW (prev_init)) / type_size;
3014 if (diff < 1 || diff > UINT_MAX)
3016 /* For artificial testcases with array accesses with large
3017 constant indices we can run into overflow issues which
3018 can end up fooling the groupsize constraint below so
3019 check the individual gaps (which are represented as
3020 unsigned int) as well. */
3021 if (dump_enabled_p ())
3022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3023 "interleaved access with gap larger "
3024 "than representable\n");
3025 return false;
3027 if (diff != 1)
3029 /* FORNOW: SLP of accesses with gaps is not supported. */
3030 slp_impossible = true;
3031 if (DR_IS_WRITE (data_ref))
3033 if (dump_enabled_p ())
3034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3035 "interleaved store with gaps\n");
3036 return false;
3039 gaps += diff - 1;
3042 last_accessed_element += diff;
3044 /* Store the gap from the previous member of the group. If there is no
3045 gap in the access, DR_GROUP_GAP is always 1. */
3046 DR_GROUP_GAP (next) = diff;
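 /* For instance (hypothetical): loads from a[4*i], a[4*i + 2] and
    a[4*i + 3] give DIFF values of 2 and 1, so the second member gets
    DR_GROUP_GAP == 2 (one skipped element) and the third
    DR_GROUP_GAP == 1 (no gap).  */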
3048 prev_init = DR_INIT (data_ref);
3049 next = DR_GROUP_NEXT_ELEMENT (next);
3050 /* Count the number of data-refs in the chain. */
3051 count++;
3054 if (groupsize == 0)
3055 groupsize = count + gaps;
3057 /* This could be UINT_MAX but as we are generating code in a very
3058 inefficient way we have to cap earlier. See PR78699 for example. */
3059 if (groupsize > 4096)
3061 if (dump_enabled_p ())
3062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3063 "group is too large\n");
3064 return false;
3067 /* Check that the size of the interleaving is equal to count for stores,
3068 i.e., that there are no gaps. */
3069 if (groupsize != count
3070 && !DR_IS_READ (dr))
3072 groupsize = count;
3073 STMT_VINFO_STRIDED_P (stmt_info) = true;
3076 /* If there is a gap after the last load in the group it is the
3077 difference between the groupsize and the last accessed
3078 element.
3079 When there is no gap, this difference should be 0. */
3080 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
3082 DR_GROUP_SIZE (stmt_info) = groupsize;
3083 if (dump_enabled_p ())
3085 dump_printf_loc (MSG_NOTE, vect_location,
3086 "Detected interleaving ");
3087 if (DR_IS_READ (dr))
3088 dump_printf (MSG_NOTE, "load ");
3089 else if (STMT_VINFO_STRIDED_P (stmt_info))
3090 dump_printf (MSG_NOTE, "strided store ");
3091 else
3092 dump_printf (MSG_NOTE, "store ");
3093 dump_printf (MSG_NOTE, "of size %u\n",
3094 (unsigned)groupsize);
3095 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
3096 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3097 while (next)
3099 if (DR_GROUP_GAP (next) != 1)
3100 dump_printf_loc (MSG_NOTE, vect_location,
3101 "\t<gap of %d elements>\n",
3102 DR_GROUP_GAP (next) - 1);
3103 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
3104 next = DR_GROUP_NEXT_ELEMENT (next);
3106 if (DR_GROUP_GAP (stmt_info) != 0)
3107 dump_printf_loc (MSG_NOTE, vect_location,
3108 "\t<gap of %d elements>\n",
3109 DR_GROUP_GAP (stmt_info));
3112 /* SLP: create an SLP data structure for every interleaving group of
3113 stores for further analysis in vect_analyze_slp. */
3114 if (DR_IS_WRITE (dr) && !slp_impossible)
3116 if (loop_vinfo)
3117 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
3118 if (bb_vinfo)
3119 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3123 return true;
3126 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3127 accesses of legal size, step, etc. Detect gaps, single element
3128 interleaving, and other special cases. Set grouped access info.
3129 Collect groups of strided stores for further use in SLP analysis. */
3131 static bool
3132 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
3134 if (!vect_analyze_group_access_1 (vinfo, dr_info))
3136 /* Dissolve the group if present. */
3137 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
3138 while (stmt_info)
3140 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3141 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3142 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3143 stmt_info = next;
3145 return false;
3147 return true;
3150 /* Analyze the access pattern of the data-reference DR_INFO.
3151 In case of non-consecutive accesses call vect_analyze_group_access() to
3152 analyze groups of accesses. */
3154 static bool
3155 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
3157 data_reference *dr = dr_info->dr;
3158 tree step = DR_STEP (dr);
3159 tree scalar_type = TREE_TYPE (DR_REF (dr));
3160 stmt_vec_info stmt_info = dr_info->stmt;
3161 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3162 class loop *loop = NULL;
3164 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
3165 return true;
3167 if (loop_vinfo)
3168 loop = LOOP_VINFO_LOOP (loop_vinfo);
3170 if (loop_vinfo && !step)
3172 if (dump_enabled_p ())
3173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3174 "bad data-ref access in loop\n");
3175 return false;
3178 /* Allow loads with zero step in inner-loop vectorization. */
3179 if (loop_vinfo && integer_zerop (step))
3181 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3182 if (!nested_in_vect_loop_p (loop, stmt_info))
3183 return DR_IS_READ (dr);
3184 /* Allow references with zero step only for outer loops marked
3185 with pragma omp simd, since that guarantees the absence of
3186 loop-carried dependencies between inner loop iterations. */
3187 if (loop->safelen < 2)
3189 if (dump_enabled_p ())
3190 dump_printf_loc (MSG_NOTE, vect_location,
3191 "zero step in inner loop of nest\n");
3192 return false;
3196 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3198 /* Interleaved accesses are not yet supported within outer-loop
3199 vectorization for references in the inner-loop. */
3200 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3202 /* For the rest of the analysis we use the outer-loop step. */
3203 step = STMT_VINFO_DR_STEP (stmt_info);
3204 if (integer_zerop (step))
3206 if (dump_enabled_p ())
3207 dump_printf_loc (MSG_NOTE, vect_location,
3208 "zero step in outer loop.\n");
3209 return DR_IS_READ (dr);
3213 /* Consecutive? */
3214 if (TREE_CODE (step) == INTEGER_CST)
3216 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
3217 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
3218 || (dr_step < 0
3219 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
3221 /* Mark that it is not interleaving. */
3222 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3223 return true;
3227 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3229 if (dump_enabled_p ())
3230 dump_printf_loc (MSG_NOTE, vect_location,
3231 "grouped access in outer loop.\n");
3232 return false;
3236 /* Assume this is a DR handled by non-constant strided load case. */
3237 if (TREE_CODE (step) != INTEGER_CST)
3238 return (STMT_VINFO_STRIDED_P (stmt_info)
3239 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3240 || vect_analyze_group_access (vinfo, dr_info)));
3242 /* Not consecutive access - check if it's a part of interleaving group. */
3243 return vect_analyze_group_access (vinfo, dr_info);
3246 /* Compare two data-references DRA and DRB to group them into chunks
3247 suitable for building interleaving groups. */
3249 static int
3250 dr_group_sort_cmp (const void *dra_, const void *drb_)
3252 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
3253 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
3254 data_reference_p dra = dra_info->dr;
3255 data_reference_p drb = drb_info->dr;
3256 int cmp;
3258 /* Stabilize sort. */
3259 if (dra == drb)
3260 return 0;
3262 /* DRs with different group IDs never belong to the same group. */
3263 if (dra_info->group != drb_info->group)
3264 return dra_info->group < drb_info->group ? -1 : 1;
3266 /* Ordering of DRs according to base. */
3267 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3268 DR_BASE_ADDRESS (drb));
3269 if (cmp != 0)
3270 return cmp;
3272 /* And according to DR_OFFSET. */
3273 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
3274 if (cmp != 0)
3275 return cmp;
3277 /* Put reads before writes. */
3278 if (DR_IS_READ (dra) != DR_IS_READ (drb))
3279 return DR_IS_READ (dra) ? -1 : 1;
3281 /* Then sort after access size. */
3282 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3283 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3284 if (cmp != 0)
3285 return cmp;
3287 /* And after step. */
3288 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3289 if (cmp != 0)
3290 return cmp;
3292 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
3293 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3294 if (cmp == 0)
3295 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3296 return cmp;
3299 /* If OP is the result of a conversion, return the unconverted value,
3300 otherwise return null. */
3302 static tree
3303 strip_conversion (tree op)
3305 if (TREE_CODE (op) != SSA_NAME)
3306 return NULL_TREE;
3307 gimple *stmt = SSA_NAME_DEF_STMT (op);
3308 if (!is_gimple_assign (stmt)
3309 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3310 return NULL_TREE;
3311 return gimple_assign_rhs1 (stmt);
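 
 /* E.g. (a sketch in GIMPLE-like pseudocode, names hypothetical): for
      mask_2 = (vector_bool_type) mask_1;
    strip_conversion (mask_2) returns mask_1, while for any non-conversion
    definition it returns NULL_TREE.  */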
3314 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3315 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3316 be grouped in SLP mode. */
3318 static bool
3319 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3320 bool allow_slp_p)
3322 if (gimple_assign_single_p (stmt1_info->stmt))
3323 return gimple_assign_single_p (stmt2_info->stmt);
3325 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3326 if (call1 && gimple_call_internal_p (call1))
3328 /* Check for two masked loads or two masked stores. */
3329 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3330 if (!call2 || !gimple_call_internal_p (call2))
3331 return false;
3332 internal_fn ifn = gimple_call_internal_fn (call1);
3333 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3334 return false;
3335 if (ifn != gimple_call_internal_fn (call2))
3336 return false;
3338 /* Check that the masks are the same. Cope with casts of masks,
3339 like those created by build_mask_conversion. */
3340 tree mask1 = gimple_call_arg (call1, 2);
3341 tree mask2 = gimple_call_arg (call2, 2);
3342 if (!operand_equal_p (mask1, mask2, 0) && !allow_slp_p)
3344 mask1 = strip_conversion (mask1);
3345 if (!mask1)
3346 return false;
3347 mask2 = strip_conversion (mask2);
3348 if (!mask2)
3349 return false;
3350 if (!operand_equal_p (mask1, mask2, 0))
3351 return false;
3353 return true;
3356 return false;
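 
 /* For example (a sketch, statement names hypothetical): the two masked
    loads
      x_1 = .MASK_LOAD (p_1, align, mask_1);
      x_2 = .MASK_LOAD (p_2, align, mask_2);
    can be grouped when mask_1 and mask_2 are the same SSA name or differ
    only by a conversion, or unconditionally in SLP mode when ALLOW_SLP_P
    is true.  */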
3359 /* Function vect_analyze_data_ref_accesses.
3361 Analyze the access pattern of all the data references in the loop.
3363 FORNOW: the only access pattern that is considered vectorizable is a
3364 simple step 1 (consecutive) access.
3366 FORNOW: handle only arrays and pointer accesses. */
3368 opt_result
3369 vect_analyze_data_ref_accesses (vec_info *vinfo,
3370 vec<int> *dataref_groups)
3372 unsigned int i;
3373 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3375 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3377 if (datarefs.is_empty ())
3378 return opt_result::success ();
3380 /* Sort the array of datarefs to make building the interleaving chains
3381 linear. Don't modify the original vector's order; it is needed for
3382 determining what dependencies are reversed. */
3383 vec<dr_vec_info *> datarefs_copy;
3384 datarefs_copy.create (datarefs.length ());
3385 for (unsigned i = 0; i < datarefs.length (); i++)
3387 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3388 /* If the caller computed DR grouping use that, otherwise group by
3389 basic blocks. */
3390 if (dataref_groups)
3391 dr_info->group = (*dataref_groups)[i];
3392 else
3393 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3394 datarefs_copy.quick_push (dr_info);
3396 datarefs_copy.qsort (dr_group_sort_cmp);
3397 hash_set<stmt_vec_info> to_fixup;
3399 /* Build the interleaving chains. */
3400 for (i = 0; i < datarefs_copy.length () - 1;)
3402 dr_vec_info *dr_info_a = datarefs_copy[i];
3403 data_reference_p dra = dr_info_a->dr;
3404 int dra_group_id = dr_info_a->group;
3405 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3406 stmt_vec_info lastinfo = NULL;
3407 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3408 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3410 ++i;
3411 continue;
3413 for (i = i + 1; i < datarefs_copy.length (); ++i)
3415 dr_vec_info *dr_info_b = datarefs_copy[i];
3416 data_reference_p drb = dr_info_b->dr;
3417 int drb_group_id = dr_info_b->group;
3418 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3419 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3420 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3421 break;
3423 /* ??? Imperfect sorting (non-compatible types, non-modulo
3424 accesses, same accesses) can lead to a group being artificially
3425 split here as we don't just skip over those. If it really
3426 matters we can push those to a worklist and re-iterate
3427 over them. Then we can just skip ahead to the next DR here. */
3429 /* DRs in a different DR group should not be put into the same
3430 interleaving group. */
3431 if (dra_group_id != drb_group_id)
3432 break;
3434 /* Check that the data-refs have the same first location (except init)
3435 and they are both either store or load (not load and store,
3436 not masked loads or stores). */
3437 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3438 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3439 DR_BASE_ADDRESS (drb)) != 0
3440 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3441 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3442 break;
3444 /* Check that the data-refs have the same constant size. */
3445 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3446 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3447 if (!tree_fits_uhwi_p (sza)
3448 || !tree_fits_uhwi_p (szb)
3449 || !tree_int_cst_equal (sza, szb))
3450 break;
3452 /* Check that the data-refs have the same step. */
3453 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3454 break;
3456 /* Check the types are compatible.
3457 ??? We don't distinguish this during sorting. */
3458 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3459 TREE_TYPE (DR_REF (drb))))
3460 break;
3462 /* Check that the DR_INITs are compile-time constants. */
3463 if (!tree_fits_shwi_p (DR_INIT (dra))
3464 || !tree_fits_shwi_p (DR_INIT (drb)))
3465 break;
3467 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3468 just hold extra information. */
3469 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3470 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3471 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3472 break;
3474 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3475 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3476 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3477 HOST_WIDE_INT init_prev
3478 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3479 gcc_assert (init_a <= init_b
3480 && init_a <= init_prev
3481 && init_prev <= init_b);
3483 /* Do not place the same access in the interleaving chain twice. */
3484 if (init_b == init_prev)
3486 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3487 < gimple_uid (DR_STMT (drb)));
3488 /* Simply link in duplicates and fix up the chain below. */
3490 else
3492 /* If init_b == init_a + the size of the type * k, we have an
3493 interleaving, and DRA is accessed before DRB. */
3494 unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3495 if (type_size_a == 0
3496 || (((unsigned HOST_WIDE_INT)init_b - init_a)
3497 % type_size_a != 0))
3498 break;
3500 /* If we have a store, the accesses are adjacent. This splits
3501 groups into chunks we support (we don't support vectorization
3502 of stores with gaps). */
3503 if (!DR_IS_READ (dra)
3504 && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3505 != type_size_a))
3506 break;
3508 /* If the step (if nonzero and constant) is smaller than the
3509 difference between the data-refs' inits, this splits groups into
3510 suitable sizes. */
3511 if (tree_fits_shwi_p (DR_STEP (dra)))
3513 unsigned HOST_WIDE_INT step
3514 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3515 if (step != 0
3516 && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3517 break;
3521 if (dump_enabled_p ())
3522 dump_printf_loc (MSG_NOTE, vect_location,
3523 DR_IS_READ (dra)
3524 ? "Detected interleaving load %T and %T\n"
3525 : "Detected interleaving store %T and %T\n",
3526 DR_REF (dra), DR_REF (drb));
3528 /* Link the found element into the group list. */
3529 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3531 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3532 lastinfo = stmtinfo_a;
3534 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3535 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3536 lastinfo = stmtinfo_b;
3538 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3539 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3541 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3542 dump_printf_loc (MSG_NOTE, vect_location,
3543 "Load suitable for SLP vectorization only.\n");
3545 if (init_b == init_prev
3546 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3547 && dump_enabled_p ())
3548 dump_printf_loc (MSG_NOTE, vect_location,
3549 "Queuing group with duplicate access for fixup\n");
3553 /* Fix up groups with duplicate entries by splitting them. */
3554 while (1)
3556 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3557 if (!(it != to_fixup.end ()))
3558 break;
3559 stmt_vec_info grp = *it;
3560 to_fixup.remove (grp);
3562 /* Find the earliest duplicate group member. */
3563 unsigned first_duplicate = -1u;
3564 stmt_vec_info next, g = grp;
3565 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3567 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3568 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3569 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3570 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3571 g = next;
3573 if (first_duplicate == -1U)
3574 continue;
3576 /* Then move all stmts after the first duplicate to a new group.
3577 Note this is a heuristic but one with the property that *it
3578 is fixed up completely. */
3579 g = grp;
3580 stmt_vec_info newgroup = NULL, ng = grp;
3581 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3583 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3585 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3586 if (!newgroup)
3587 newgroup = next;
3588 else
3589 DR_GROUP_NEXT_ELEMENT (ng) = next;
3590 ng = next;
3591 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3593 else
3594 g = DR_GROUP_NEXT_ELEMENT (g);
3596 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3598 /* Fixup the new group which still may contain duplicates. */
3599 to_fixup.add (newgroup);
3602 dr_vec_info *dr_info;
3603 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3605 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3606 && !vect_analyze_data_ref_access (vinfo, dr_info))
3608 if (dump_enabled_p ())
3609 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3610 "not vectorized: complicated access pattern.\n");
3612 if (is_a <bb_vec_info> (vinfo))
3614 /* Mark the statement as not vectorizable. */
3615 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3616 continue;
3618 else
3620 datarefs_copy.release ();
3621 return opt_result::failure_at (dr_info->stmt->stmt,
3622 "not vectorized:"
3623 " complicated access pattern.\n");
3628 datarefs_copy.release ();
3629 return opt_result::success ();
3632 /* Function vect_vfa_segment_size.
3634 Input:
3635 DR_INFO: The data reference.
3636 LENGTH_FACTOR: segment length to consider.
3638 Return a value suitable for the dr_with_seg_len::seg_len field.
3639 This is the "distance travelled" by the pointer from the first
3640 iteration in the segment to the last. Note that it does not include
3641 the size of the access; in effect it only describes the first byte. */
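/* For example (illustrative numbers only): for a contiguous access with a
   DR_STEP of 4 bytes and a LENGTH_FACTOR of 4 iterations, the result is
   4 * (4 - 1) = 12 bytes, the distance from the first byte accessed in the
   first iteration of the segment to the first byte accessed in the last.  */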
3643 static tree
3644 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3646 length_factor = size_binop (MINUS_EXPR,
3647 fold_convert (sizetype, length_factor),
3648 size_one_node);
3649 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3650 length_factor);
3653 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3654 gives the worst-case number of bytes covered by the segment. */
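/* A rough worked example (hypothetical numbers): for the first statement of
   a group of four 4-byte accesses with a group gap of 1, the code below
   yields 4 * (4 - 1) = 12 bytes; if the access would use
   dr_explicit_realign_optimized, a further full vector minus one scalar
   element is added, since a whole vector may be read.  */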
3656 static unsigned HOST_WIDE_INT
3657 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3659 stmt_vec_info stmt_vinfo = dr_info->stmt;
3660 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3661 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3662 unsigned HOST_WIDE_INT access_size = ref_size;
3663 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3665 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3666 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3668 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3669 int misalignment;
3670 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3671 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3672 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3673 == dr_explicit_realign_optimized))
3675 /* We might access a full vector's worth. */
3676 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3678 return access_size;
3681 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3682 describes. */
3684 static unsigned int
3685 vect_vfa_align (dr_vec_info *dr_info)
3687 return dr_alignment (dr_info->dr);
3690 /* Function vect_no_alias_p.
3692 Given data references A and B with equal base and offset, see whether
3693 the alias relation can be decided at compilation time. Return 1 if
3694 it can and the references alias, 0 if it can and the references do
3695 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3696 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3697 of dr_with_seg_len::{seg_len,access_size} for A and B. */
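/* A small worked example (hypothetical values): with positive steps,
   DR_INITs of 0 and 16, segment lengths of 12 and access sizes of 4, the
   ranges tested below are [0, 16) and [16, 32), which cannot overlap, so 0
   is returned; with a second DR_INIT of 8 instead, the ranges [0, 16) and
   [8, 24) are known to overlap and 1 is returned.  */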
3699 static int
3700 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3701 tree segment_length_a, tree segment_length_b,
3702 unsigned HOST_WIDE_INT access_size_a,
3703 unsigned HOST_WIDE_INT access_size_b)
3705 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3706 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3707 poly_uint64 const_length_a;
3708 poly_uint64 const_length_b;
3710 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3711 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3712 [a, a+12) */
3713 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3715 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3716 offset_a -= const_length_a;
3718 else
3719 const_length_a = tree_to_poly_uint64 (segment_length_a);
3720 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3722 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3723 offset_b -= const_length_b;
3725 else
3726 const_length_b = tree_to_poly_uint64 (segment_length_b);
3728 const_length_a += access_size_a;
3729 const_length_b += access_size_b;
3731 if (ranges_known_overlap_p (offset_a, const_length_a,
3732 offset_b, const_length_b))
3733 return 1;
3735 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3736 offset_b, const_length_b))
3737 return 0;
3739 return -1;
3742 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3743 in DDR is >= VF. */
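/* For example (hypothetical distances): if every nonzero distance vector
   entry at LOOP_DEPTH is 8 or more and the chosen VF is 4, the dependence
   cannot be violated by vectorization, so the caller can drop the
   corresponding runtime alias check.  */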
3745 static bool
3746 dependence_distance_ge_vf (data_dependence_relation *ddr,
3747 unsigned int loop_depth, poly_uint64 vf)
3749 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3750 || DDR_NUM_DIST_VECTS (ddr) == 0)
3751 return false;
3753 /* If the dependence is exact, we should have limited the VF instead. */
3754 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3756 unsigned int i;
3757 lambda_vector dist_v;
3758 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3760 HOST_WIDE_INT dist = dist_v[loop_depth];
3761 if (dist != 0
3762 && !(dist > 0 && DDR_REVERSED_P (ddr))
3763 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3764 return false;
3767 if (dump_enabled_p ())
3768 dump_printf_loc (MSG_NOTE, vect_location,
3769 "dependence distance between %T and %T is >= VF\n",
3770 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3772 return true;
3775 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3777 static void
3778 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3780 dump_printf (dump_kind, "%s (%T) >= ",
3781 lower_bound.unsigned_p ? "unsigned" : "abs",
3782 lower_bound.expr);
3783 dump_dec (dump_kind, lower_bound.min_value);
3786 /* Record that the vectorized loop requires the vec_lower_bound described
3787 by EXPR, UNSIGNED_P and MIN_VALUE. */
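/* For instance (illustrative only): if a lower bound of 4 has already been
   recorded for EXPR and a new request arrives asking for an unsigned bound
   of 8, the code below keeps a single entry whose min_value is the larger
   of the two, and the entry stays "unsigned" only if both requests were
   unsigned.  */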
3789 static void
3790 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3791 poly_uint64 min_value)
3793 vec<vec_lower_bound> &lower_bounds
3794 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3795 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3796 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3798 unsigned_p &= lower_bounds[i].unsigned_p;
3799 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3800 if (lower_bounds[i].unsigned_p != unsigned_p
3801 || maybe_lt (lower_bounds[i].min_value, min_value))
3803 lower_bounds[i].unsigned_p = unsigned_p;
3804 lower_bounds[i].min_value = min_value;
3805 if (dump_enabled_p ())
3807 dump_printf_loc (MSG_NOTE, vect_location,
3808 "updating run-time check to ");
3809 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3810 dump_printf (MSG_NOTE, "\n");
3813 return;
3816 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3817 if (dump_enabled_p ())
3819 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3820 dump_lower_bound (MSG_NOTE, lower_bound);
3821 dump_printf (MSG_NOTE, "\n");
3823 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3826 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3827 will span fewer than GAP bytes. */
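/* In other words (a sketch with hypothetical numbers): with an estimated
   vectorization factor of 4, a group of 2 accesses and a 4-byte scalar
   access size, any GAP of at most 4 * 2 * 4 = 32 bytes is considered
   small by the check below.  */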
3829 static bool
3830 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3831 poly_int64 gap)
3833 stmt_vec_info stmt_info = dr_info->stmt;
3834 HOST_WIDE_INT count
3835 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3836 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3837 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3838 return (estimated_poly_value (gap)
3839 <= count * vect_get_scalar_dr_size (dr_info));
3842 /* Return true if we know that there is no alias between DR_INFO_A and
3843 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3844 When returning true, set *LOWER_BOUND_OUT to this N. */
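/* For example (hypothetical layout): two 4-byte accesses off the same base,
   offset and step with DR_INITs 0 and 4 produce a lower bound of
   4 + 4 - 0 = 8, so the accesses cannot alias whenever
   abs (DR_STEP) >= 8.  */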
3846 static bool
3847 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3848 poly_uint64 *lower_bound_out)
3850 /* Check that there is a constant gap of known sign between DR_A
3851 and DR_B. */
3852 data_reference *dr_a = dr_info_a->dr;
3853 data_reference *dr_b = dr_info_b->dr;
3854 poly_int64 init_a, init_b;
3855 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3856 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3857 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3858 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3859 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3860 || !ordered_p (init_a, init_b))
3861 return false;
3863 /* Sort DR_A and DR_B by the address they access. */
3864 if (maybe_lt (init_b, init_a))
3866 std::swap (init_a, init_b);
3867 std::swap (dr_info_a, dr_info_b);
3868 std::swap (dr_a, dr_b);
3871 /* If the two accesses could be dependent within a scalar iteration,
3872 make sure that we'd retain their order. */
3873 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3874 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3875 return false;
3877 /* There is no alias if abs (DR_STEP) is greater than or equal to
3878 the bytes spanned by the combination of the two accesses. */
3879 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3880 return true;
3883 /* Function vect_prune_runtime_alias_test_list.
3885 Prune a list of ddrs to be tested at run-time by versioning for alias.
3886 Merge several alias checks into one if possible.
3887 Return FALSE if the resulting list of ddrs is longer than allowed by
3888 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
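/* A typical situation that reaches this point (illustrative, not taken
   from the source) is a loop over two pointers the compiler cannot prove
   distinct:

     void
     scale (int *dst, int *src, int n)
     {
       for (int i = 0; i < n; i++)
         dst[i] = src[i] * 2;
     }

   Without restrict qualifiers, vectorizing this needs a run-time check that
   the accessed segments of dst and src do not overlap; this function
   decides whether the set of such checks is cheap enough to emit.  */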
3890 opt_result
3891 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3893 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3894 hash_set <tree_pair_hash> compared_objects;
3896 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3897 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3898 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3899 const vec<vec_object_pair> &check_unequal_addrs
3900 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3901 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3902 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3904 ddr_p ddr;
3905 unsigned int i;
3906 tree length_factor;
3908 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3910 /* Step values are irrelevant for aliasing if the number of vector
3911 iterations is equal to the number of scalar iterations (which can
3912 happen for fully-SLP loops). */
3913 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3915 if (!vf_one_p)
3917 /* Convert the checks for nonzero steps into bound tests. */
3918 tree value;
3919 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3920 vect_check_lower_bound (loop_vinfo, value, true, 1);
3923 if (may_alias_ddrs.is_empty ())
3924 return opt_result::success ();
3926 comp_alias_ddrs.create (may_alias_ddrs.length ());
3928 unsigned int loop_depth
3929 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3930 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3932 /* First, we collect all data ref pairs for aliasing checks. */
3933 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3935 poly_uint64 lower_bound;
3936 tree segment_length_a, segment_length_b;
3937 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3938 unsigned int align_a, align_b;
3940 /* Ignore the alias if the VF we chose ended up being no greater
3941 than the dependence distance. */
3942 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3943 continue;
3945 if (DDR_OBJECT_A (ddr))
3947 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3948 if (!compared_objects.add (new_pair))
3950 if (dump_enabled_p ())
3951 dump_printf_loc (MSG_NOTE, vect_location,
3952 "checking that %T and %T"
3953 " have different addresses\n",
3954 new_pair.first, new_pair.second);
3955 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3957 continue;
3960 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3961 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3963 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3964 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3966 bool preserves_scalar_order_p
3967 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3968 bool ignore_step_p
3969 = (vf_one_p
3970 && (preserves_scalar_order_p
3971 || operand_equal_p (DR_STEP (dr_info_a->dr),
3972 DR_STEP (dr_info_b->dr))));
3974 /* Skip the pair if inter-iteration dependencies are irrelevant
3975 and intra-iteration dependencies are guaranteed to be honored. */
3976 if (ignore_step_p
3977 && (preserves_scalar_order_p
3978 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3979 &lower_bound)))
3981 if (dump_enabled_p ())
3982 dump_printf_loc (MSG_NOTE, vect_location,
3983 "no need for alias check between "
3984 "%T and %T when VF is 1\n",
3985 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3986 continue;
3989 /* See whether we can handle the alias using a bounds check on
3990 the step, and whether that's likely to be the best approach.
3991 (It might not be, for example, if the minimum step is much larger
3992 than the number of bytes handled by one vector iteration.) */
3993 if (!ignore_step_p
3994 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3995 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3996 &lower_bound)
3997 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3998 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
4000 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
4001 if (dump_enabled_p ())
4003 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
4004 "%T and %T when the step %T is outside ",
4005 DR_REF (dr_info_a->dr),
4006 DR_REF (dr_info_b->dr),
4007 DR_STEP (dr_info_a->dr));
4008 if (unsigned_p)
4009 dump_printf (MSG_NOTE, "[0");
4010 else
4012 dump_printf (MSG_NOTE, "(");
4013 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
4015 dump_printf (MSG_NOTE, ", ");
4016 dump_dec (MSG_NOTE, lower_bound);
4017 dump_printf (MSG_NOTE, ")\n");
4019 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
4020 unsigned_p, lower_bound);
4021 continue;
4024 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
4025 if (dr_group_first_a)
4027 stmt_info_a = dr_group_first_a;
4028 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
4031 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
4032 if (dr_group_first_b)
4034 stmt_info_b = dr_group_first_b;
4035 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
4038 if (ignore_step_p)
4040 segment_length_a = size_zero_node;
4041 segment_length_b = size_zero_node;
4043 else
4045 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
4046 DR_STEP (dr_info_b->dr), 0))
4047 length_factor = scalar_loop_iters;
4048 else
4049 length_factor = size_int (vect_factor);
4050 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
4051 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
4053 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
4054 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
4055 align_a = vect_vfa_align (dr_info_a);
4056 align_b = vect_vfa_align (dr_info_b);
4058 /* See whether the alias is known at compilation time. */
4059 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
4060 DR_BASE_ADDRESS (dr_info_b->dr), 0)
4061 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
4062 DR_OFFSET (dr_info_b->dr), 0)
4063 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
4064 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
4065 && poly_int_tree_p (segment_length_a)
4066 && poly_int_tree_p (segment_length_b))
4068 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
4069 segment_length_a,
4070 segment_length_b,
4071 access_size_a,
4072 access_size_b);
4073 if (res >= 0 && dump_enabled_p ())
4075 dump_printf_loc (MSG_NOTE, vect_location,
4076 "can tell at compile time that %T and %T",
4077 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4078 if (res == 0)
4079 dump_printf (MSG_NOTE, " do not alias\n");
4080 else
4081 dump_printf (MSG_NOTE, " alias\n");
4084 if (res == 0)
4085 continue;
4087 if (res == 1)
4088 return opt_result::failure_at (stmt_info_b->stmt,
4089 "not vectorized:"
4090 " compilation time alias: %G%G",
4091 stmt_info_a->stmt,
4092 stmt_info_b->stmt);
4095 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
4096 access_size_a, align_a);
4097 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
4098 access_size_b, align_b);
4099 /* Canonicalize the order to be the one that's needed for accurate
4100 RAW, WAR and WAW flags, in cases where the data references are
4101 well-ordered. The order doesn't really matter otherwise,
4102 but we might as well be consistent. */
4103 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
4104 std::swap (dr_a, dr_b);
4106 dr_with_seg_len_pair_t dr_with_seg_len_pair
4107 (dr_a, dr_b, (preserves_scalar_order_p
4108 ? dr_with_seg_len_pair_t::WELL_ORDERED
4109 : dr_with_seg_len_pair_t::REORDERED));
4111 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
4114 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
4116 unsigned int count = (comp_alias_ddrs.length ()
4117 + check_unequal_addrs.length ());
4119 if (count
4120 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
4121 == VECT_COST_MODEL_VERY_CHEAP))
4122 return opt_result::failure_at
4123 (vect_location, "would need a runtime alias check\n");
4125 if (dump_enabled_p ())
4126 dump_printf_loc (MSG_NOTE, vect_location,
4127 "improved number of alias checks from %d to %d\n",
4128 may_alias_ddrs.length (), count);
4129 unsigned limit = param_vect_max_version_for_alias_checks;
4130 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
4131 limit = param_vect_max_version_for_alias_checks * 6 / 10;
4132 if (count > limit)
4133 return opt_result::failure_at
4134 (vect_location,
4135 "number of versioning for alias run-time tests exceeds %d "
4136 "(--param vect-max-version-for-alias-checks)\n", limit);
4138 return opt_result::success ();
4141 /* Check whether we can use an internal function for a gather load
4142 or scatter store. READ_P is true for loads and false for stores.
4143 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
4144 the type of the memory elements being loaded or stored. OFFSET_TYPE
4145 is the type of the offset that is being applied to the invariant
4146 base address. SCALE is the amount by which the offset should
4147 be multiplied *after* it has been converted to address width.
4149 Return true if the function is supported, storing the function id in
4150 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
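/* Note the retry loop below: if the target does not handle the given
   offset type, the offset is widened to double its precision (keeping its
   signedness) and tried again; we only give up once the precision is at
   least POINTER_SIZE and at least as wide as the vector elements. On a
   hypothetical target, an unsupported 16-bit offset would thus be retried
   as a 32-bit and then a 64-bit offset.  */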
4152 bool
4153 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
4154 tree vectype, tree memory_type, tree offset_type,
4155 int scale, internal_fn *ifn_out,
4156 tree *offset_vectype_out)
4158 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
4159 unsigned int element_bits = vector_element_bits (vectype);
4160 if (element_bits != memory_bits)
4161 /* For now the vector elements must be the same width as the
4162 memory elements. */
4163 return false;
4165 /* Work out which function we need. */
4166 internal_fn ifn, alt_ifn, alt_ifn2;
4167 if (read_p)
4169 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
4170 alt_ifn = IFN_MASK_GATHER_LOAD;
4171 /* When the target supports MASK_LEN_GATHER_LOAD, we always
4172 use MASK_LEN_GATHER_LOAD regardless of whether len and
4173 mask are valid or not. */
4174 alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
4176 else
4178 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
4179 alt_ifn = IFN_MASK_SCATTER_STORE;
4180 /* When the target supports MASK_LEN_SCATTER_STORE, we always
4181 use MASK_LEN_SCATTER_STORE regardless of whether len and
4182 mask are valid or not. */
4183 alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
4186 for (;;)
4188 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
4189 if (!offset_vectype)
4190 return false;
4192 /* Test whether the target supports this combination. */
4193 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
4194 offset_vectype, scale))
4196 *ifn_out = ifn;
4197 *offset_vectype_out = offset_vectype;
4198 return true;
4200 else if (!masked_p
4201 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
4202 memory_type,
4203 offset_vectype,
4204 scale))
4206 *ifn_out = alt_ifn;
4207 *offset_vectype_out = offset_vectype;
4208 return true;
4210 else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
4211 memory_type,
4212 offset_vectype, scale))
4214 *ifn_out = alt_ifn2;
4215 *offset_vectype_out = offset_vectype;
4216 return true;
4219 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
4220 && TYPE_PRECISION (offset_type) >= element_bits)
4221 return false;
4223 offset_type = build_nonstandard_integer_type
4224 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
4228 /* STMT_INFO is a call to an internal gather load or scatter store function.
4229 Describe the operation in INFO. */
4231 static void
4232 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
4233 gather_scatter_info *info)
4235 gcall *call = as_a <gcall *> (stmt_info->stmt);
4236 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4237 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4239 info->ifn = gimple_call_internal_fn (call);
4240 info->decl = NULL_TREE;
4241 info->base = gimple_call_arg (call, 0);
4242 info->offset = gimple_call_arg (call, 1);
4243 info->offset_dt = vect_unknown_def_type;
4244 info->offset_vectype = NULL_TREE;
4245 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
4246 info->element_type = TREE_TYPE (vectype);
4247 info->memory_type = TREE_TYPE (DR_REF (dr));
4250 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
4251 gather load or scatter store. Describe the operation in *INFO if so. */
4253 bool
4254 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
4255 gather_scatter_info *info)
4257 HOST_WIDE_INT scale = 1;
4258 poly_int64 pbitpos, pbitsize;
4259 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4260 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4261 tree offtype = NULL_TREE;
4262 tree decl = NULL_TREE, base, off;
4263 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4264 tree memory_type = TREE_TYPE (DR_REF (dr));
4265 machine_mode pmode;
4266 int punsignedp, reversep, pvolatilep = 0;
4267 internal_fn ifn;
4268 tree offset_vectype;
4269 bool masked_p = false;
4271 /* See whether this is already a call to a gather/scatter internal function.
4272 If not, see whether it's a masked load or store. */
4273 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
4274 if (call && gimple_call_internal_p (call))
4276 ifn = gimple_call_internal_fn (call);
4277 if (internal_gather_scatter_fn_p (ifn))
4279 vect_describe_gather_scatter_call (stmt_info, info);
4280 return true;
4282 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4285 /* True if we should aim to use internal functions rather than
4286 built-in functions. */
4287 bool use_ifn_p = (DR_IS_READ (dr)
4288 ? supports_vec_gather_load_p (TYPE_MODE (vectype))
4289 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4291 base = DR_REF (dr);
4292 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4293 see if we can use the def stmt of the address. */
4294 if (masked_p
4295 && TREE_CODE (base) == MEM_REF
4296 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4297 && integer_zerop (TREE_OPERAND (base, 1))
4298 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4300 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4301 if (is_gimple_assign (def_stmt)
4302 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4303 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4306 /* The gather and scatter builtins need address of the form
4307 loop_invariant + vector * {1, 2, 4, 8}
4308 or
4309 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4310 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4311 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4312 multiplications and additions in it. To get a vector, we need
4313 a single SSA_NAME that will be defined in the loop and will
4314 contain everything that is not loop invariant and that can be
4315 vectorized. The following code attempts to find such a preexisting
4316 SSA_NAME OFF and put the loop invariants into a tree BASE
4317 that can be gimplified before the loop. */
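/* As a concrete illustration (not part of the original comment), in

     for (int i = 0; i < n; i++)
       sum += a[idx[i]];

   the loop-invariant piece is the address of "a", while the SSA_NAME we
   are looking for is the value loaded from idx[i] (with sizeof (*a)
   becoming the scale); the code below peels invariant additions and
   scalings off OFF and into BASE.  */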
4318 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4319 &punsignedp, &reversep, &pvolatilep);
4320 if (reversep)
4321 return false;
4323 /* PR 107346. Packed structs can have fields at offsets that are not
4324 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4325 if (!multiple_p (pbitpos, BITS_PER_UNIT))
4326 return false;
4328 /* We need to be able to form an address to the base which for example
4329 isn't possible for hard registers. */
4330 if (may_be_nonaddressable_p (base))
4331 return false;
4333 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4335 if (TREE_CODE (base) == MEM_REF)
4337 if (!integer_zerop (TREE_OPERAND (base, 1)))
4339 if (off == NULL_TREE)
4340 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4341 else
4342 off = size_binop (PLUS_EXPR, off,
4343 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4345 base = TREE_OPERAND (base, 0);
4347 else
4348 base = build_fold_addr_expr (base);
4350 if (off == NULL_TREE)
4351 off = size_zero_node;
4353 /* If base is not loop invariant, then either off is 0, in which case we
4354 start with just the constant offset in the loop invariant BASE and
4355 continue with base as OFF, or we give up.
4356 We could handle that case by gimplifying the addition of base + off
4357 into some SSA_NAME and using that as off, but for now punt. */
4358 if (!expr_invariant_in_loop_p (loop, base))
4360 if (!integer_zerop (off))
4361 return false;
4362 off = base;
4363 base = size_int (pbytepos);
4365 /* Otherwise put base + constant offset into the loop invariant BASE
4366 and continue with OFF. */
4367 else
4369 base = fold_convert (sizetype, base);
4370 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4373 /* OFF at this point may be either a SSA_NAME or some tree expression
4374 from get_inner_reference. Try to peel off loop invariants from it
4375 into BASE as long as possible. */
4376 STRIP_NOPS (off);
4377 while (offtype == NULL_TREE)
4379 enum tree_code code;
4380 tree op0, op1, add = NULL_TREE;
4382 if (TREE_CODE (off) == SSA_NAME)
4384 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4386 if (expr_invariant_in_loop_p (loop, off))
4387 return false;
4389 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4390 break;
4392 op0 = gimple_assign_rhs1 (def_stmt);
4393 code = gimple_assign_rhs_code (def_stmt);
4394 op1 = gimple_assign_rhs2 (def_stmt);
4396 else
4398 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4399 return false;
4400 code = TREE_CODE (off);
4401 extract_ops_from_tree (off, &code, &op0, &op1);
4403 switch (code)
4405 case POINTER_PLUS_EXPR:
4406 case PLUS_EXPR:
4407 if (expr_invariant_in_loop_p (loop, op0))
4409 add = op0;
4410 off = op1;
4411 do_add:
4412 add = fold_convert (sizetype, add);
4413 if (scale != 1)
4414 add = size_binop (MULT_EXPR, add, size_int (scale));
4415 base = size_binop (PLUS_EXPR, base, add);
4416 continue;
4418 if (expr_invariant_in_loop_p (loop, op1))
4420 add = op1;
4421 off = op0;
4422 goto do_add;
4424 break;
4425 case MINUS_EXPR:
4426 if (expr_invariant_in_loop_p (loop, op1))
4428 add = fold_convert (sizetype, op1);
4429 add = size_binop (MINUS_EXPR, size_zero_node, add);
4430 off = op0;
4431 goto do_add;
4433 break;
4434 case MULT_EXPR:
4435 if (scale == 1 && tree_fits_shwi_p (op1))
4437 int new_scale = tree_to_shwi (op1);
4438 /* Only treat this as a scaling operation if the target
4439 supports it for at least some offset type. */
4440 if (use_ifn_p
4441 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4442 masked_p, vectype, memory_type,
4443 signed_char_type_node,
4444 new_scale, &ifn,
4445 &offset_vectype)
4446 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4447 masked_p, vectype, memory_type,
4448 unsigned_char_type_node,
4449 new_scale, &ifn,
4450 &offset_vectype))
4451 break;
4452 scale = new_scale;
4453 off = op0;
4454 continue;
4456 break;
4457 case SSA_NAME:
4458 off = op0;
4459 continue;
4460 CASE_CONVERT:
4461 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4462 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4463 break;
4465 /* Don't include the conversion if the target is happy with
4466 the current offset type. */
4467 if (use_ifn_p
4468 && TREE_CODE (off) == SSA_NAME
4469 && !POINTER_TYPE_P (TREE_TYPE (off))
4470 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4471 masked_p, vectype, memory_type,
4472 TREE_TYPE (off), scale, &ifn,
4473 &offset_vectype))
4474 break;
4476 if (TYPE_PRECISION (TREE_TYPE (op0))
4477 == TYPE_PRECISION (TREE_TYPE (off)))
4479 off = op0;
4480 continue;
4483 /* Include the conversion if it is widening and we're using
4484 the IFN path, or the target can handle the converted-from
4485 offset, or the current size is not already the same as the
4486 data vector element size. */
4487 if ((TYPE_PRECISION (TREE_TYPE (op0))
4488 < TYPE_PRECISION (TREE_TYPE (off)))
4489 && (use_ifn_p
4490 || (DR_IS_READ (dr)
4491 ? (targetm.vectorize.builtin_gather
4492 && targetm.vectorize.builtin_gather (vectype,
4493 TREE_TYPE (op0),
4494 scale))
4495 : (targetm.vectorize.builtin_scatter
4496 && targetm.vectorize.builtin_scatter (vectype,
4497 TREE_TYPE (op0),
4498 scale)))
4499 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4500 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4502 off = op0;
4503 offtype = TREE_TYPE (off);
4504 STRIP_NOPS (off);
4505 continue;
4507 break;
4508 default:
4509 break;
4511 break;
4514 /* If at the end OFF still isn't a SSA_NAME or isn't
4515 defined in the loop, punt. */
4516 if (TREE_CODE (off) != SSA_NAME
4517 || expr_invariant_in_loop_p (loop, off))
4518 return false;
4520 if (offtype == NULL_TREE)
4521 offtype = TREE_TYPE (off);
4523 if (use_ifn_p)
4525 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4526 vectype, memory_type, offtype, scale,
4527 &ifn, &offset_vectype))
4528 ifn = IFN_LAST;
4529 decl = NULL_TREE;
4531 else
4533 if (DR_IS_READ (dr))
4535 if (targetm.vectorize.builtin_gather)
4536 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4538 else
4540 if (targetm.vectorize.builtin_scatter)
4541 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4543 ifn = IFN_LAST;
4544 /* The offset vector type will be read from DECL when needed. */
4545 offset_vectype = NULL_TREE;
4548 info->ifn = ifn;
4549 info->decl = decl;
4550 info->base = base;
4551 info->offset = off;
4552 info->offset_dt = vect_unknown_def_type;
4553 info->offset_vectype = offset_vectype;
4554 info->scale = scale;
4555 info->element_type = TREE_TYPE (vectype);
4556 info->memory_type = memory_type;
4557 return true;
4560 /* Find the data references in STMT, analyze them with respect to LOOP and
4561 append them to DATAREFS. Return false if datarefs in this stmt cannot
4562 be handled. */
4564 opt_result
4565 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4566 vec<data_reference_p> *datarefs,
4567 vec<int> *dataref_groups, int group_id)
4569 /* We can ignore clobbers for dataref analysis - they are removed during
4570 loop vectorization and BB vectorization checks dependences with a
4571 stmt walk. */
4572 if (gimple_clobber_p (stmt))
4573 return opt_result::success ();
4575 if (gimple_has_volatile_ops (stmt))
4576 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4577 stmt);
4579 if (stmt_can_throw_internal (cfun, stmt))
4580 return opt_result::failure_at (stmt,
4581 "not vectorized:"
4582 " statement can throw an exception: %G",
4583 stmt);
4585 auto_vec<data_reference_p, 2> refs;
4586 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4587 if (!res)
4588 return res;
4590 if (refs.is_empty ())
4591 return opt_result::success ();
4593 if (refs.length () > 1)
4595 while (!refs.is_empty ())
4596 free_data_ref (refs.pop ());
4597 return opt_result::failure_at (stmt,
4598 "not vectorized: more than one "
4599 "data ref in stmt: %G", stmt);
4602 data_reference_p dr = refs.pop ();
4603 if (gcall *call = dyn_cast <gcall *> (stmt))
4604 if (!gimple_call_internal_p (call)
4605 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4606 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4608 free_data_ref (dr);
4609 return opt_result::failure_at (stmt,
4610 "not vectorized: dr in a call %G", stmt);
4613 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4614 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4616 free_data_ref (dr);
4617 return opt_result::failure_at (stmt,
4618 "not vectorized:"
4619 " statement is an unsupported"
4620 " bitfield access %G", stmt);
4623 if (DR_BASE_ADDRESS (dr)
4624 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4626 free_data_ref (dr);
4627 return opt_result::failure_at (stmt,
4628 "not vectorized:"
4629 " base addr of dr is a constant\n");
4632 /* Check whether this may be a SIMD lane access and adjust the
4633 DR to make it easier for us to handle it. */
4634 if (loop
4635 && loop->simduid
4636 && (!DR_BASE_ADDRESS (dr)
4637 || !DR_OFFSET (dr)
4638 || !DR_INIT (dr)
4639 || !DR_STEP (dr)))
4641 struct data_reference *newdr
4642 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4643 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4644 if (DR_BASE_ADDRESS (newdr)
4645 && DR_OFFSET (newdr)
4646 && DR_INIT (newdr)
4647 && DR_STEP (newdr)
4648 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4649 && integer_zerop (DR_STEP (newdr)))
4651 tree base_address = DR_BASE_ADDRESS (newdr);
4652 tree off = DR_OFFSET (newdr);
4653 tree step = ssize_int (1);
4654 if (integer_zerop (off)
4655 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4657 off = TREE_OPERAND (base_address, 1);
4658 base_address = TREE_OPERAND (base_address, 0);
4660 STRIP_NOPS (off);
4661 if (TREE_CODE (off) == MULT_EXPR
4662 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4664 step = TREE_OPERAND (off, 1);
4665 off = TREE_OPERAND (off, 0);
4666 STRIP_NOPS (off);
4668 if (CONVERT_EXPR_P (off)
4669 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4670 < TYPE_PRECISION (TREE_TYPE (off))))
4671 off = TREE_OPERAND (off, 0);
4672 if (TREE_CODE (off) == SSA_NAME)
4674 gimple *def = SSA_NAME_DEF_STMT (off);
4675 /* Look through widening conversion. */
4676 if (is_gimple_assign (def)
4677 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4679 tree rhs1 = gimple_assign_rhs1 (def);
4680 if (TREE_CODE (rhs1) == SSA_NAME
4681 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4682 && (TYPE_PRECISION (TREE_TYPE (off))
4683 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4684 def = SSA_NAME_DEF_STMT (rhs1);
4686 if (is_gimple_call (def)
4687 && gimple_call_internal_p (def)
4688 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4690 tree arg = gimple_call_arg (def, 0);
4691 tree reft = TREE_TYPE (DR_REF (newdr));
4692 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4693 arg = SSA_NAME_VAR (arg);
4694 if (arg == loop->simduid
4695 /* For now. */
4696 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4698 DR_BASE_ADDRESS (newdr) = base_address;
4699 DR_OFFSET (newdr) = ssize_int (0);
4700 DR_STEP (newdr) = step;
4701 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4702 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4703 /* Mark as simd-lane access. */
4704 tree arg2 = gimple_call_arg (def, 1);
4705 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4706 free_data_ref (dr);
4707 datarefs->safe_push (newdr);
4708 if (dataref_groups)
4709 dataref_groups->safe_push (group_id);
4710 return opt_result::success ();
4715 free_data_ref (newdr);
4718 datarefs->safe_push (dr);
4719 if (dataref_groups)
4720 dataref_groups->safe_push (group_id);
4721 return opt_result::success ();
4724 /* Function vect_analyze_data_refs.
4726 Find all the data references in the loop or basic block.
4728 The general structure of the analysis of data refs in the vectorizer is as
4729 follows:
4730 1- vect_analyze_data_refs(loop/bb): call
4731 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4732 in the loop/bb and their dependences.
4733 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4734 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4735 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4739 opt_result
4740 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4742 class loop *loop = NULL;
4743 unsigned int i;
4744 struct data_reference *dr;
4745 tree scalar_type;
4747 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4749 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4750 loop = LOOP_VINFO_LOOP (loop_vinfo);
4752 /* Go through the data-refs, check that the analysis succeeded. Update
4753 pointer from stmt_vec_info struct to DR and vectype. */
4755 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4756 FOR_EACH_VEC_ELT (datarefs, i, dr)
4758 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4759 poly_uint64 vf;
4761 gcc_assert (DR_REF (dr));
4762 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4763 gcc_assert (!stmt_info->dr_aux.dr);
4764 stmt_info->dr_aux.dr = dr;
4765 stmt_info->dr_aux.stmt = stmt_info;
4767 /* Check that analysis of the data-ref succeeded. */
4768 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4769 || !DR_STEP (dr))
4771 bool maybe_gather
4772 = DR_IS_READ (dr)
4773 && !TREE_THIS_VOLATILE (DR_REF (dr));
4774 bool maybe_scatter
4775 = DR_IS_WRITE (dr)
4776 && !TREE_THIS_VOLATILE (DR_REF (dr));
4778 /* If the target supports vector gather loads or scatter stores,
4779 see whether they can be used. */
4780 if (is_a <loop_vec_info> (vinfo)
4781 && !nested_in_vect_loop_p (loop, stmt_info))
4783 if (maybe_gather || maybe_scatter)
4785 if (maybe_gather)
4786 gatherscatter = GATHER;
4787 else
4788 gatherscatter = SCATTER;
4792 if (gatherscatter == SG_NONE)
4794 if (dump_enabled_p ())
4795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4796 "not vectorized: data ref analysis "
4797 "failed %G", stmt_info->stmt);
4798 if (is_a <bb_vec_info> (vinfo))
4800 /* In BB vectorization the ref can still participate
4801 in dependence analysis, we just can't vectorize it. */
4802 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4803 continue;
4805 return opt_result::failure_at (stmt_info->stmt,
4806 "not vectorized:"
4807 " data ref analysis failed: %G",
4808 stmt_info->stmt);
4812 /* See if this was detected as SIMD lane access. */
4813 if (dr->aux == (void *)-1
4814 || dr->aux == (void *)-2
4815 || dr->aux == (void *)-3
4816 || dr->aux == (void *)-4)
4818 if (nested_in_vect_loop_p (loop, stmt_info))
4819 return opt_result::failure_at (stmt_info->stmt,
4820 "not vectorized:"
4821 " data ref analysis failed: %G",
4822 stmt_info->stmt);
4823 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4824 = -(uintptr_t) dr->aux;
4827 tree base = get_base_address (DR_REF (dr));
4828 if (base && VAR_P (base) && DECL_NONALIASED (base))
4830 if (dump_enabled_p ())
4831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4832 "not vectorized: base object not addressable "
4833 "for stmt: %G", stmt_info->stmt);
4834 if (is_a <bb_vec_info> (vinfo))
4836 /* In BB vectorization the ref can still participate
4837 in dependence analysis, we just can't vectorize it. */
4838 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4839 continue;
4841 return opt_result::failure_at (stmt_info->stmt,
4842 "not vectorized: base object not"
4843 " addressable for stmt: %G",
4844 stmt_info->stmt);
4847 if (is_a <loop_vec_info> (vinfo)
4848 && DR_STEP (dr)
4849 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4851 if (nested_in_vect_loop_p (loop, stmt_info))
4852 return opt_result::failure_at (stmt_info->stmt,
4853 "not vectorized: "
4854 "not suitable for strided load %G",
4855 stmt_info->stmt);
4856 STMT_VINFO_STRIDED_P (stmt_info) = true;
4859 /* Update DR field in stmt_vec_info struct. */
4861 /* If the dataref is in an inner-loop of the loop that is considered
4862 for vectorization, we also want to analyze the access relative to
4863 the outer-loop (DR contains information only relative to the
4864 inner-most enclosing loop). We do that by building a reference to the
4865 first location accessed by the inner-loop, and analyze it relative to
4866 the outer-loop. */
4867 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4869 /* Build a reference to the first location accessed by the
4870 inner loop: *(BASE + INIT + OFFSET). By construction,
4871 this address must be invariant in the inner loop, so we
4872 can consider it as being used in the outer loop. */
4873 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4874 tree offset = unshare_expr (DR_OFFSET (dr));
4875 tree init = unshare_expr (DR_INIT (dr));
4876 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4877 init, offset);
4878 tree init_addr = fold_build_pointer_plus (base, init_offset);
4879 tree init_ref = build_fold_indirect_ref (init_addr);
4881 if (dump_enabled_p ())
4882 dump_printf_loc (MSG_NOTE, vect_location,
4883 "analyze in outer loop: %T\n", init_ref);
4885 opt_result res
4886 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4887 init_ref, loop, stmt_info->stmt);
4888 if (!res)
4889 /* dr_analyze_innermost already explained the failure. */
4890 return res;
4892 if (dump_enabled_p ())
4893 dump_printf_loc (MSG_NOTE, vect_location,
4894 "\touter base_address: %T\n"
4895 "\touter offset from base address: %T\n"
4896 "\touter constant offset from base address: %T\n"
4897 "\touter step: %T\n"
4898 "\touter base alignment: %d\n\n"
4899 "\touter base misalignment: %d\n"
4900 "\touter offset alignment: %d\n"
4901 "\touter step alignment: %d\n",
4902 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4903 STMT_VINFO_DR_OFFSET (stmt_info),
4904 STMT_VINFO_DR_INIT (stmt_info),
4905 STMT_VINFO_DR_STEP (stmt_info),
4906 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4907 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4908 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4909 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4912 /* Set vectype for STMT. */
4913 scalar_type = TREE_TYPE (DR_REF (dr));
4914 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4915 if (!vectype)
4917 if (dump_enabled_p ())
4919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4920 "not vectorized: no vectype for stmt: %G",
4921 stmt_info->stmt);
4922 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4923 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4924 scalar_type);
4925 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4928 if (is_a <bb_vec_info> (vinfo))
4930 /* No vector type is fine, the ref can still participate
4931 in dependence analysis, we just can't vectorize it. */
4932 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4933 continue;
4935 if (fatal)
4936 *fatal = false;
4937 return opt_result::failure_at (stmt_info->stmt,
4938 "not vectorized:"
4939 " no vectype for stmt: %G"
4940 " scalar_type: %T\n",
4941 stmt_info->stmt, scalar_type);
4943 else
4945 if (dump_enabled_p ())
4946 dump_printf_loc (MSG_NOTE, vect_location,
4947 "got vectype for stmt: %G%T\n",
4948 stmt_info->stmt, vectype);
4951 /* Adjust the minimal vectorization factor according to the
4952 vector type. */
4953 vf = TYPE_VECTOR_SUBPARTS (vectype);
4954 *min_vf = upper_bound (*min_vf, vf);
4956 /* Leave the BB vectorizer to pick the vector type later, based on
4957 the final dataref group size and SLP node size. */
4958 if (is_a <loop_vec_info> (vinfo))
4959 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4961 if (gatherscatter != SG_NONE)
4963 gather_scatter_info gs_info;
4964 if (!vect_check_gather_scatter (stmt_info,
4965 as_a <loop_vec_info> (vinfo),
4966 &gs_info)
4967 || !get_vectype_for_scalar_type (vinfo,
4968 TREE_TYPE (gs_info.offset)))
4970 if (fatal)
4971 *fatal = false;
4972 return opt_result::failure_at
4973 (stmt_info->stmt,
4974 (gatherscatter == GATHER)
4975 ? "not vectorized: not suitable for gather load %G"
4976 : "not vectorized: not suitable for scatter store %G",
4977 stmt_info->stmt);
4979 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4983 /* We used to stop processing and prune the list here. Verify we no
4984 longer need to. */
4985 gcc_assert (i == datarefs.length ());
4987 return opt_result::success ();
4991 /* Function vect_get_new_vect_var.
4993 Returns a name for a new variable. The current naming scheme prepends
4994 the prefix "vect_", "vectp_", "stmp_" or "mask_" (depending on the
4995 value of VAR_KIND) to the name of vectorizer generated variables,
4996 using NAME as the suffix when it is provided. */
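/* For example, vect_get_new_vect_var (ptr_type, vect_pointer_var, "a")
   creates a temporary named along the lines of "vectp_a", while a NULL
   NAME falls back to the bare prefix; "ptr_type" here just stands for
   whatever type the caller passes.  */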
4998 tree
4999 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
5001 const char *prefix;
5002 tree new_vect_var;
5004 switch (var_kind)
5006 case vect_simple_var:
5007 prefix = "vect";
5008 break;
5009 case vect_scalar_var:
5010 prefix = "stmp";
5011 break;
5012 case vect_mask_var:
5013 prefix = "mask";
5014 break;
5015 case vect_pointer_var:
5016 prefix = "vectp";
5017 break;
5018 default:
5019 gcc_unreachable ();
5022 if (name)
5024 char* tmp = concat (prefix, "_", name, NULL);
5025 new_vect_var = create_tmp_reg (type, tmp);
5026 free (tmp);
5028 else
5029 new_vect_var = create_tmp_reg (type, prefix);
5031 return new_vect_var;
5034 /* Like vect_get_new_vect_var but return an SSA name. */
5036 tree
5037 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
5039 const char *prefix;
5040 tree new_vect_var;
5042 switch (var_kind)
5044 case vect_simple_var:
5045 prefix = "vect";
5046 break;
5047 case vect_scalar_var:
5048 prefix = "stmp";
5049 break;
5050 case vect_pointer_var:
5051 prefix = "vectp";
5052 break;
5053 default:
5054 gcc_unreachable ();
5057 if (name)
5059 char* tmp = concat (prefix, "_", name, NULL);
5060 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
5061 free (tmp);
5063 else
5064 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
5066 return new_vect_var;
5069 /* Duplicate points-to info on NAME from DR_INFO. */
5071 static void
5072 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
5074 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
5075 /* DR_PTR_INFO is for a base SSA name, not including constant or
5076 variable offsets in the ref so its alignment info does not apply. */
5077 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
5080 /* Function vect_create_addr_base_for_vector_ref.
5082 Create an expression that computes the address of the first memory location
5083 that will be accessed for a data reference.
5085 Input:
5086 STMT_INFO: The statement containing the data reference.
5087 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
5088 OFFSET: Optional. If supplied, it is added to the initial address.
5089 LOOP: Specify relative to which loop-nest should the address be computed.
5090 For example, when the dataref is in an inner-loop nested in an
5091 outer-loop that is now being vectorized, LOOP can be either the
5092 outer-loop, or the inner-loop. The first memory location accessed
5093 by the following dataref ('in' points to short):
5095 for (i=0; i<N; i++)
5096 for (j=0; j<M; j++)
5097 s += in[i+j]
5099 is as follows:
5100 if LOOP=i_loop: &in (relative to i_loop)
5101 if LOOP=j_loop: &in+i*2B (relative to j_loop)
5103 Output:
5104 1. Return an SSA_NAME whose value is the address of the memory location of
5105 the first vector of the data reference.
5106 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
5107 these statement(s) which define the returned SSA_NAME.
5109 FORNOW: We are only handling array accesses with step 1. */
5111 tree
5112 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
5113 gimple_seq *new_stmt_list,
5114 tree offset)
5116 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5117 struct data_reference *dr = dr_info->dr;
5118 const char *base_name;
5119 tree addr_base;
5120 tree dest;
5121 gimple_seq seq = NULL;
5122 tree vect_ptr_type;
5123 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5124 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
5126 tree data_ref_base = unshare_expr (drb->base_address);
5127 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
5128 tree init = unshare_expr (drb->init);
5130 if (loop_vinfo)
5131 base_name = get_name (data_ref_base);
5132 else
5134 base_offset = ssize_int (0);
5135 init = ssize_int (0);
5136 base_name = get_name (DR_REF (dr));
5139 /* Create base_offset */
5140 base_offset = size_binop (PLUS_EXPR,
5141 fold_convert (sizetype, base_offset),
5142 fold_convert (sizetype, init));
5144 if (offset)
5146 offset = fold_convert (sizetype, offset);
5147 base_offset = fold_build2 (PLUS_EXPR, sizetype,
5148 base_offset, offset);
5151 /* base + base_offset */
5152 if (loop_vinfo)
5153 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
5154 else
5155 addr_base = build1 (ADDR_EXPR,
5156 build_pointer_type (TREE_TYPE (DR_REF (dr))),
5157 /* Strip zero offset components since we don't need
5158 them and they can confuse late diagnostics if
5159 we CSE them wrongly. See PR106904 for example. */
5160 unshare_expr (strip_zero_offset_components
5161 (DR_REF (dr))));
5163 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
5164 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
5165 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
5166 gimple_seq_add_seq (new_stmt_list, seq);
5168 if (DR_PTR_INFO (dr)
5169 && TREE_CODE (addr_base) == SSA_NAME
5170 /* We should only duplicate pointer info to newly created SSA names. */
5171 && SSA_NAME_VAR (addr_base) == dest)
5173 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
5174 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
5177 if (dump_enabled_p ())
5178 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
5180 return addr_base;
5184 /* Function vect_create_data_ref_ptr.
5186 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
5187 location accessed in the loop by STMT_INFO, along with the def-use update
5188 chain to appropriately advance the pointer through the loop iterations.
5189 Also set aliasing information for the pointer. This pointer is used by
5190 the callers to this function to create a memory reference expression for
5191 vector load/store access.
5193 Input:
5194 1. STMT_INFO: a stmt that references memory. Expected to be of the form
5195 GIMPLE_ASSIGN <name, data-ref> or
5196 GIMPLE_ASSIGN <data-ref, name>.
5197 2. AGGR_TYPE: the type of the reference, which should be either a vector
5198 or an array.
5199 3. AT_LOOP: the loop where the vector memref is to be created.
5200 4. OFFSET (optional): a byte offset to be added to the initial address
5201 accessed by the data-ref in STMT_INFO.
5202 5. BSI: location where the new stmts are to be placed if there is no loop
5203 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
5204 pointing to the initial address.
5205 7. IV_STEP (optional, defaults to NULL): the amount that should be added
5206 to the IV during each iteration of the loop. NULL says to move
5207 by one copy of AGGR_TYPE up or down, depending on the step of the
5208 data reference.
5210 Output:
5211 1. Declare a new ptr to vector_type, and have it point to the base of the
5212 data reference (initial address accessed by the data reference).
5213 For example, for vector of type V8HI, the following code is generated:
5215 v8hi *ap;
5216 ap = (v8hi *)initial_address;
5218 if OFFSET is not supplied:
5219 initial_address = &a[init];
5220 if OFFSET is supplied:
5221 initial_address = &a[init] + OFFSET;
5225 Return the initial_address in INITIAL_ADDRESS.
5227 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
5228 update the pointer in each iteration of the loop.
5230 Return the increment stmt that updates the pointer in PTR_INCR.
5232 3. Return the pointer. */
5234 tree
5235 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
5236 tree aggr_type, class loop *at_loop, tree offset,
5237 tree *initial_address, gimple_stmt_iterator *gsi,
5238 gimple **ptr_incr, bool only_init,
5239 tree iv_step)
5241 const char *base_name;
5242 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5243 class loop *loop = NULL;
5244 bool nested_in_vect_loop = false;
5245 class loop *containing_loop = NULL;
5246 tree aggr_ptr_type;
5247 tree aggr_ptr;
5248 tree new_temp;
5249 gimple_seq new_stmt_list = NULL;
5250 edge pe = NULL;
5251 basic_block new_bb;
5252 tree aggr_ptr_init;
5253 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5254 struct data_reference *dr = dr_info->dr;
5255 tree aptr;
5256 gimple_stmt_iterator incr_gsi;
5257 bool insert_after;
5258 tree indx_before_incr, indx_after_incr;
5259 gimple *incr;
5260 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5262 gcc_assert (iv_step != NULL_TREE
5263 || TREE_CODE (aggr_type) == ARRAY_TYPE
5264 || TREE_CODE (aggr_type) == VECTOR_TYPE);
5266 if (loop_vinfo)
5268 loop = LOOP_VINFO_LOOP (loop_vinfo);
5269 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5270 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5271 pe = loop_preheader_edge (loop);
5273 else
5275 gcc_assert (bb_vinfo);
5276 only_init = true;
5277 *ptr_incr = NULL;
5280 /* Create an expression for the first address accessed by this load
5281 in LOOP. */
5282 base_name = get_name (DR_BASE_ADDRESS (dr));
5284 if (dump_enabled_p ())
5286 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
5287 dump_printf_loc (MSG_NOTE, vect_location,
5288 "create %s-pointer variable to type: %T",
5289 get_tree_code_name (TREE_CODE (aggr_type)),
5290 aggr_type);
5291 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
5292 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
5293 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
5294 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
5295 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5296 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
5297 else
5298 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
5299 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5302 /* (1) Create the new aggregate-pointer variable.
5303 Vector and array types inherit the alias set of their component
5304 type by default so we need to use a ref-all pointer if the data
5305 reference does not conflict with the created aggregated data
5306 reference because it is not addressable. */
5307 bool need_ref_all = false;
5308 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5309 get_alias_set (DR_REF (dr))))
5310 need_ref_all = true;
5311 /* Likewise for any of the data references in the stmt group. */
5312 else if (DR_GROUP_SIZE (stmt_info) > 1)
5314 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5317 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5318 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5319 get_alias_set (DR_REF (sdr))))
5321 need_ref_all = true;
5322 break;
5324 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5326 while (sinfo);
5328 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5329 need_ref_all);
5330 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5333 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5334 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5335 def-use update cycles for the pointer: one relative to the outer-loop
5336 (LOOP), which is what steps (3) and (4) below do. The other is relative
5337 to the inner-loop (which is the inner-most loop containing the dataref),
5338 and this is done by step (5) below.
5340 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5341 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5342 redundant. Steps (3),(4) create the following:
5344 vp0 = &base_addr;
5345 LOOP: vp1 = phi(vp0,vp2)
5348 vp2 = vp1 + step
5349 goto LOOP
5351 If there is an inner-loop nested in loop, then step (5) will also be
5352 applied, and an additional update in the inner-loop will be created:
5354 vp0 = &base_addr;
5355 LOOP: vp1 = phi(vp0,vp2)
5357 inner: vp3 = phi(vp1,vp4)
5358 vp4 = vp3 + inner_step
5359 if () goto inner
5361 vp2 = vp1 + step
5362 if () goto LOOP */
5364 /* (2) Calculate the initial address of the aggregate-pointer, and set
5365 the aggregate-pointer to point to it before the loop. */
5367 /* Create: &(base[init_val] + offset) in the loop preheader. */
5369 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5370 stmt_info, &new_stmt_list,
5371 offset);
5372 if (new_stmt_list)
5374 if (pe)
5376 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5377 gcc_assert (!new_bb);
5379 else
5380 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5383 *initial_address = new_temp;
5384 aggr_ptr_init = new_temp;
5386 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5387 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5388 inner-loop nested in LOOP (during outer-loop vectorization). */
5390 /* No update in loop is required. */
5391 if (only_init && (!loop_vinfo || at_loop == loop))
5392 aptr = aggr_ptr_init;
5393 else
5395 /* Accesses to invariant addresses should be handled specially
5396 by the caller. */
5397 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5398 gcc_assert (!integer_zerop (step));
5400 if (iv_step == NULL_TREE)
5402 /* The step of the aggregate pointer is the type size,
5403 negated for downward accesses. */
5404 iv_step = TYPE_SIZE_UNIT (aggr_type);
5405 if (tree_int_cst_sgn (step) == -1)
5406 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5409 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5411 create_iv (aggr_ptr_init, PLUS_EXPR,
5412 fold_convert (aggr_ptr_type, iv_step),
5413 aggr_ptr, loop, &incr_gsi, insert_after,
5414 &indx_before_incr, &indx_after_incr);
5415 incr = gsi_stmt (incr_gsi);
5417 /* Copy the points-to information if it exists. */
5418 if (DR_PTR_INFO (dr))
5420 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5421 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5423 if (ptr_incr)
5424 *ptr_incr = incr;
5426 aptr = indx_before_incr;
5429 if (!nested_in_vect_loop || only_init)
5430 return aptr;
5433 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5434 nested in LOOP, if it exists. */
5436 gcc_assert (nested_in_vect_loop);
5437 if (!only_init)
5439 standard_iv_increment_position (containing_loop, &incr_gsi,
5440 &insert_after);
5441 create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5442 aggr_ptr, containing_loop, &incr_gsi, insert_after,
5443 &indx_before_incr, &indx_after_incr);
5444 incr = gsi_stmt (incr_gsi);
5446 /* Copy the points-to information if it exists. */
5447 if (DR_PTR_INFO (dr))
5449 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5450 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5452 if (ptr_incr)
5453 *ptr_incr = incr;
5455 return indx_before_incr;
5457 else
5458 gcc_unreachable ();
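
/* Editorial illustration (not part of GCC): the def-use cycle that steps
   (2)-(4) above set up, written as plain C for a v8hi-sized access with a
   positive unit step; the helper name and use of __builtin_memcpy are
   editorial.  */

static void
data_ref_ptr_model (const short *initial_address, short *dst, int niters)
{
  const short *vp = initial_address;	/* vp0 = &base_addr;  (preheader)  */
  for (int i = 0; i < niters; i++)	/* LOOP: vp1 = phi (vp0, vp2)      */
    {
      __builtin_memcpy (dst + 8 * i, vp, 8 * sizeof (short));
      vp += 8;				/* vp2 = vp1 + step                */
    }
}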
5462 /* Function bump_vector_ptr
5464 Increment a pointer (to a vector type) by vector-size. If requested,
5465 i.e. if PTR-INCR is given, then also connect the new increment stmt
5466 to the existing def-use update-chain of the pointer, by modifying
5467 the PTR_INCR as illustrated below:
5469 The pointer def-use update-chain before this function:
5470 DATAREF_PTR = phi (p_0, p_2)
5471 ....
5472 PTR_INCR: p_2 = DATAREF_PTR + step
5474 The pointer def-use update-chain after this function:
5475 DATAREF_PTR = phi (p_0, p_2)
5476 ....
5477 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5478 ....
5479 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5481 Input:
5482 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5483 in the loop.
5484 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5485 the loop. The increment amount across iterations is expected
5486 to be vector_size.
5487 BSI - location where the new update stmt is to be placed.
5488 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5489 BUMP - optional. The offset by which to bump the pointer. If not given,
5490 the offset is assumed to be vector_size.
5492 Output: Return NEW_DATAREF_PTR as illustrated above.
5496 tree
5497 bump_vector_ptr (vec_info *vinfo,
5498 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5499 stmt_vec_info stmt_info, tree bump)
5501 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5502 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5503 tree update = TYPE_SIZE_UNIT (vectype);
5504 gimple *incr_stmt;
5505 ssa_op_iter iter;
5506 use_operand_p use_p;
5507 tree new_dataref_ptr;
5509 if (bump)
5510 update = bump;
5512 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5513 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5514 else if (is_gimple_min_invariant (dataref_ptr))
5515 /* When possible avoid emitting a separate increment stmt that would
5516 force the addressed object to become addressable. */
5517 return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5518 fold_build2 (MEM_REF,
5519 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5520 dataref_ptr,
5521 fold_convert (ptr_type_node, update)));
5522 else
5523 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5524 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5525 dataref_ptr, update);
5526 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5527 /* Fold the increment, avoiding excessively long use-def chains that
5528 lead to compile-time issues for passes run before the next forwprop
5529 pass, which would otherwise do this folding as well. */
5530 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5531 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5533 incr_stmt = gsi_stmt (fold_gsi);
5534 update_stmt (incr_stmt);
5537 /* Copy the points-to information if it exists. */
5538 if (DR_PTR_INFO (dr))
5540 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5541 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5544 if (!ptr_incr)
5545 return new_dataref_ptr;
5547 /* Update the vector-pointer's cross-iteration increment. */
5548 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5550 tree use = USE_FROM_PTR (use_p);
5552 if (use == dataref_ptr)
5553 SET_USE (use_p, new_dataref_ptr);
5554 else
5555 gcc_assert (operand_equal_p (use, update, 0));
5558 return new_dataref_ptr;
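
/* Editorial illustration (not part of GCC): the net effect of
   bump_vector_ptr on the pointer IV, as plain C.  The bumped pointer is a
   fresh name; the original cross-iteration increment (PTR_INCR) is rewired
   to start from it, so the advance across iterations stays the same.
   Helper name is hypothetical.  */

static const char *
bump_vector_ptr_model (const char *dataref_ptr, ptrdiff_t bump)
{
  /* NEW_DATAREF_PTR = DATAREF_PTR + BUMP; afterwards PTR_INCR computes
     p_2 = NEW_DATAREF_PTR + step instead of p_2 = DATAREF_PTR + step.  */
  return dataref_ptr + bump;
}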
5562 /* Copy memory reference info such as base/clique from the SRC reference
5563 to the DEST MEM_REF. */
5565 void
5566 vect_copy_ref_info (tree dest, tree src)
5568 if (TREE_CODE (dest) != MEM_REF)
5569 return;
5571 tree src_base = src;
5572 while (handled_component_p (src_base))
5573 src_base = TREE_OPERAND (src_base, 0);
5574 if (TREE_CODE (src_base) != MEM_REF
5575 && TREE_CODE (src_base) != TARGET_MEM_REF)
5576 return;
5578 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5579 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5583 /* Function vect_create_destination_var.
5585 Create a new temporary of type VECTYPE. */
5587 tree
5588 vect_create_destination_var (tree scalar_dest, tree vectype)
5590 tree vec_dest;
5591 const char *name;
5592 char *new_name;
5593 tree type;
5594 enum vect_var_kind kind;
5596 kind = vectype
5597 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5598 ? vect_mask_var
5599 : vect_simple_var
5600 : vect_scalar_var;
5601 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5603 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5605 name = get_name (scalar_dest);
5606 if (name)
5607 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5608 else
5609 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5610 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5611 free (new_name);
5613 return vec_dest;
5616 /* Function vect_grouped_store_supported.
5618 Returns TRUE if interleave high and interleave low permutations
5619 are supported, and FALSE otherwise. */
5621 bool
5622 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5624 machine_mode mode = TYPE_MODE (vectype);
5626 /* vect_permute_store_chain requires the group size to be equal to 3 or
5627 be a power of two. */
5628 if (count != 3 && exact_log2 (count) == -1)
5630 if (dump_enabled_p ())
5631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5632 "the size of the group of accesses"
5633 " is not a power of 2 or not eqaul to 3\n");
5634 return false;
5637 /* Check that the permutation is supported. */
5638 if (VECTOR_MODE_P (mode))
5640 unsigned int i;
5641 if (count == 3)
5643 unsigned int j0 = 0, j1 = 0, j2 = 0;
5644 unsigned int i, j;
5646 unsigned int nelt;
5647 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5649 if (dump_enabled_p ())
5650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5651 "cannot handle groups of 3 stores for"
5652 " variable-length vectors\n");
5653 return false;
5656 vec_perm_builder sel (nelt, nelt, 1);
5657 sel.quick_grow (nelt);
5658 vec_perm_indices indices;
5659 for (j = 0; j < 3; j++)
5661 int nelt0 = ((3 - j) * nelt) % 3;
5662 int nelt1 = ((3 - j) * nelt + 1) % 3;
5663 int nelt2 = ((3 - j) * nelt + 2) % 3;
5664 for (i = 0; i < nelt; i++)
5666 if (3 * i + nelt0 < nelt)
5667 sel[3 * i + nelt0] = j0++;
5668 if (3 * i + nelt1 < nelt)
5669 sel[3 * i + nelt1] = nelt + j1++;
5670 if (3 * i + nelt2 < nelt)
5671 sel[3 * i + nelt2] = 0;
5673 indices.new_vector (sel, 2, nelt);
5674 if (!can_vec_perm_const_p (mode, mode, indices))
5676 if (dump_enabled_p ())
5677 dump_printf (MSG_MISSED_OPTIMIZATION,
5678 "permutation op not supported by target.\n");
5679 return false;
5682 for (i = 0; i < nelt; i++)
5684 if (3 * i + nelt0 < nelt)
5685 sel[3 * i + nelt0] = 3 * i + nelt0;
5686 if (3 * i + nelt1 < nelt)
5687 sel[3 * i + nelt1] = 3 * i + nelt1;
5688 if (3 * i + nelt2 < nelt)
5689 sel[3 * i + nelt2] = nelt + j2++;
5691 indices.new_vector (sel, 2, nelt);
5692 if (!can_vec_perm_const_p (mode, mode, indices))
5694 if (dump_enabled_p ())
5695 dump_printf (MSG_MISSED_OPTIMIZATION,
5696 "permutation op not supported by target.\n");
5697 return false;
5700 return true;
5702 else
5704 /* If length is not equal to 3 then only power of 2 is supported. */
5705 gcc_assert (pow2p_hwi (count));
5706 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5708 /* The encoding has 2 interleaved stepped patterns. */
5709 if (!multiple_p (nelt, 2))
5710 return false;
5711 vec_perm_builder sel (nelt, 2, 3);
5712 sel.quick_grow (6);
5713 for (i = 0; i < 3; i++)
5715 sel[i * 2] = i;
5716 sel[i * 2 + 1] = i + nelt;
5718 vec_perm_indices indices (sel, 2, nelt);
5719 if (can_vec_perm_const_p (mode, mode, indices))
5721 for (i = 0; i < 6; i++)
5722 sel[i] += exact_div (nelt, 2);
5723 indices.new_vector (sel, 2, nelt);
5724 if (can_vec_perm_const_p (mode, mode, indices))
5725 return true;
5730 if (dump_enabled_p ())
5731 dump_printf (MSG_MISSED_OPTIMIZATION,
5732 "permutation op not supported by target.\n");
5733 return false;
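
/* Editorial illustration (not part of GCC): the full masks that the
   2-pattern/3-element encoding checked above expands to for a constant
   NELT; e.g. NELT == 8 gives
     interleave_high: { 0, 8, 1, 9, 2, 10, 3, 11 }
     interleave_low:  { 4, 12, 5, 13, 6, 14, 7, 15 }.
   Helper name is hypothetical.  */

static void
build_interleave_masks (unsigned int nelt, unsigned int *high_sel,
			unsigned int *low_sel)
{
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      high_sel[2 * i] = i;
      high_sel[2 * i + 1] = nelt + i;
      low_sel[2 * i] = nelt / 2 + i;
      low_sel[2 * i + 1] = nelt + nelt / 2 + i;
    }
}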
5736 /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
5737 of type VECTYPE. MASKED_P says whether the masked form is needed. */
5739 internal_fn
5740 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5741 bool masked_p)
5743 if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5744 vec_mask_len_store_lanes_optab, vectype,
5745 count))
5746 return IFN_MASK_LEN_STORE_LANES;
5747 else if (masked_p)
5749 if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5750 vec_mask_store_lanes_optab, vectype,
5751 count))
5752 return IFN_MASK_STORE_LANES;
5754 else
5756 if (vect_lanes_optab_supported_p ("vec_store_lanes",
5757 vec_store_lanes_optab, vectype, count))
5758 return IFN_STORE_LANES;
5760 return IFN_LAST;
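
/* Editorial illustration (not part of GCC): the memory effect of an
   unmasked store-lanes operation for COUNT == 3, modelled with scalars -
   the lanes of the COUNT vectors are stored interleaved, which is why
   store-lanes can replace the explicit permutations built by
   vect_permute_store_chain below.  Helper name is hypothetical.  */

static void
store_lanes_model (int *dst, const int *v0, const int *v1, const int *v2,
		   unsigned int nelt)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      dst[3 * i + 0] = v0[i];
      dst[3 * i + 1] = v1[i];
      dst[3 * i + 2] = v2[i];
    }
}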
5764 /* Function vect_permute_store_chain.
5766 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5767 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5768 the data correctly for the stores. Return the final references for stores
5769 in RESULT_CHAIN.
5771 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5772 The input is 4 vectors each containing 8 elements. We assign a number to
5773 each element, the input sequence is:
5775 1st vec: 0 1 2 3 4 5 6 7
5776 2nd vec: 8 9 10 11 12 13 14 15
5777 3rd vec: 16 17 18 19 20 21 22 23
5778 4th vec: 24 25 26 27 28 29 30 31
5780 The output sequence should be:
5782 1st vec: 0 8 16 24 1 9 17 25
5783 2nd vec: 2 10 18 26 3 11 19 27
5784 3rd vec: 4 12 20 28 5 13 21 29
5785 4th vec: 6 14 22 30 7 15 23 31
5787 i.e., we interleave the contents of the four vectors in their order.
5789 We use interleave_high/low instructions to create such output. The input of
5790 each interleave_high/low operation is two vectors:
5791 1st vec 2nd vec
5792 0 1 2 3 4 5 6 7
5793 the even elements of the result vector are obtained left-to-right from the
5794 high/low elements of the first vector. The odd elements of the result are
5795 obtained left-to-right from the high/low elements of the second vector.
5796 The output of interleave_high will be: 0 4 1 5
5797 and of interleave_low: 2 6 3 7
5800 The permutation is done in log LENGTH stages. In each stage interleave_high
5801 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5802 where the first argument is taken from the first half of DR_CHAIN and the
5803 second argument from its second half.
5804 In our example,
5806 I1: interleave_high (1st vec, 3rd vec)
5807 I2: interleave_low (1st vec, 3rd vec)
5808 I3: interleave_high (2nd vec, 4th vec)
5809 I4: interleave_low (2nd vec, 4th vec)
5811 The output for the first stage is:
5813 I1: 0 16 1 17 2 18 3 19
5814 I2: 4 20 5 21 6 22 7 23
5815 I3: 8 24 9 25 10 26 11 27
5816 I4: 12 28 13 29 14 30 15 31
5818 The output of the second stage, i.e. the final result is:
5820 I1: 0 8 16 24 1 9 17 25
5821 I2: 2 10 18 26 3 11 19 27
5822 I3: 4 12 20 28 5 13 21 29
5823 I4: 6 14 22 30 7 15 23 31. */
5825 void
5826 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5827 unsigned int length,
5828 stmt_vec_info stmt_info,
5829 gimple_stmt_iterator *gsi,
5830 vec<tree> *result_chain)
5832 tree vect1, vect2, high, low;
5833 gimple *perm_stmt;
5834 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5835 tree perm_mask_low, perm_mask_high;
5836 tree data_ref;
5837 tree perm3_mask_low, perm3_mask_high;
5838 unsigned int i, j, n, log_length = exact_log2 (length);
5840 result_chain->quick_grow (length);
5841 memcpy (result_chain->address (), dr_chain.address (),
5842 length * sizeof (tree));
5844 if (length == 3)
5846 /* vect_grouped_store_supported ensures that this is constant. */
5847 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5848 unsigned int j0 = 0, j1 = 0, j2 = 0;
5850 vec_perm_builder sel (nelt, nelt, 1);
5851 sel.quick_grow (nelt);
5852 vec_perm_indices indices;
5853 for (j = 0; j < 3; j++)
5855 int nelt0 = ((3 - j) * nelt) % 3;
5856 int nelt1 = ((3 - j) * nelt + 1) % 3;
5857 int nelt2 = ((3 - j) * nelt + 2) % 3;
5859 for (i = 0; i < nelt; i++)
5861 if (3 * i + nelt0 < nelt)
5862 sel[3 * i + nelt0] = j0++;
5863 if (3 * i + nelt1 < nelt)
5864 sel[3 * i + nelt1] = nelt + j1++;
5865 if (3 * i + nelt2 < nelt)
5866 sel[3 * i + nelt2] = 0;
5868 indices.new_vector (sel, 2, nelt);
5869 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5871 for (i = 0; i < nelt; i++)
5873 if (3 * i + nelt0 < nelt)
5874 sel[3 * i + nelt0] = 3 * i + nelt0;
5875 if (3 * i + nelt1 < nelt)
5876 sel[3 * i + nelt1] = 3 * i + nelt1;
5877 if (3 * i + nelt2 < nelt)
5878 sel[3 * i + nelt2] = nelt + j2++;
5880 indices.new_vector (sel, 2, nelt);
5881 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5883 vect1 = dr_chain[0];
5884 vect2 = dr_chain[1];
5886 /* Create interleaving stmt:
5887 low = VEC_PERM_EXPR <vect1, vect2,
5888 {j, nelt, *, j + 1, nelt + j + 1, *,
5889 j + 2, nelt + j + 2, *, ...}> */
5890 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5891 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5892 vect2, perm3_mask_low);
5893 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5895 vect1 = data_ref;
5896 vect2 = dr_chain[2];
5897 /* Create interleaving stmt:
5898 low = VEC_PERM_EXPR <vect1, vect2,
5899 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5900 6, 7, nelt + j + 2, ...}> */
5901 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5902 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5903 vect2, perm3_mask_high);
5904 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5905 (*result_chain)[j] = data_ref;
5908 else
5910 /* If length is not equal to 3 then only power of 2 is supported. */
5911 gcc_assert (pow2p_hwi (length));
5913 /* The encoding has 2 interleaved stepped patterns. */
5914 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5915 vec_perm_builder sel (nelt, 2, 3);
5916 sel.quick_grow (6);
5917 for (i = 0; i < 3; i++)
5919 sel[i * 2] = i;
5920 sel[i * 2 + 1] = i + nelt;
5922 vec_perm_indices indices (sel, 2, nelt);
5923 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5925 for (i = 0; i < 6; i++)
5926 sel[i] += exact_div (nelt, 2);
5927 indices.new_vector (sel, 2, nelt);
5928 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5930 for (i = 0, n = log_length; i < n; i++)
5932 for (j = 0; j < length/2; j++)
5934 vect1 = dr_chain[j];
5935 vect2 = dr_chain[j+length/2];
5937 /* Create interleaving stmt:
5938 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5939 ...}> */
5940 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5941 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5942 vect2, perm_mask_high);
5943 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5944 (*result_chain)[2*j] = high;
5946 /* Create interleaving stmt:
5947 low = VEC_PERM_EXPR <vect1, vect2,
5948 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5949 ...}> */
5950 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5951 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5952 vect2, perm_mask_low);
5953 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5954 (*result_chain)[2*j+1] = low;
5956 memcpy (dr_chain.address (), result_chain->address (),
5957 length * sizeof (tree));
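
/* Editorial illustration (not part of GCC): a scalar model of the
   power-of-2 branch above.  log2(LEN) interleave_high/low stages turn LEN
   vectors of NELT elements (stored consecutively in VECS) into the
   interleaved order shown in the function comment.  SCRATCH must hold
   LEN * NELT elements; helper name and memcpy use are editorial.  */

static void
permute_store_chain_model (int *vecs, unsigned int len, unsigned int nelt,
			   int *scratch)
{
  for (unsigned int stage = 1; stage < len; stage *= 2)
    {
      for (unsigned int j = 0; j < len / 2; j++)
	{
	  const int *v1 = vecs + j * nelt;
	  const int *v2 = vecs + (j + len / 2) * nelt;
	  int *high = scratch + (2 * j) * nelt;
	  int *low = scratch + (2 * j + 1) * nelt;
	  for (unsigned int i = 0; i < nelt / 2; i++)
	    {
	      high[2 * i] = v1[i];		/* interleave_high  */
	      high[2 * i + 1] = v2[i];
	      low[2 * i] = v1[nelt / 2 + i];	/* interleave_low   */
	      low[2 * i + 1] = v2[nelt / 2 + i];
	    }
	}
      __builtin_memcpy (vecs, scratch, len * nelt * sizeof (int));
    }
}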
5962 /* Function vect_setup_realignment
5964 This function is called when vectorizing an unaligned load using
5965 the dr_explicit_realign[_optimized] scheme.
5966 This function generates the following code at the loop prolog:
5968 p = initial_addr;
5969 x msq_init = *(floor(p)); # prolog load
5970 realignment_token = call target_builtin;
5971 loop:
5972 x msq = phi (msq_init, ---)
5974 The stmts marked with x are generated only for the case of
5975 dr_explicit_realign_optimized.
5977 The code above sets up a new (vector) pointer, pointing to the first
5978 location accessed by STMT_INFO, and a "floor-aligned" load using that
5979 pointer. It also generates code to compute the "realignment-token"
5980 (if the relevant target hook was defined), and creates a phi-node at the
5981 loop-header bb whose arguments are the result of the prolog-load (created
5982 by this function) and the result of a load that takes place in the loop
5983 (to be created by the caller to this function).
5985 For the case of dr_explicit_realign_optimized:
5986 The caller to this function uses the phi-result (msq) to create the
5987 realignment code inside the loop, and sets up the missing phi argument,
5988 as follows:
5989 loop:
5990 msq = phi (msq_init, lsq)
5991 lsq = *(floor(p')); # load in loop
5992 result = realign_load (msq, lsq, realignment_token);
5994 For the case of dr_explicit_realign:
5995 loop:
5996 msq = *(floor(p)); # load in loop
5997 p' = p + (VS-1);
5998 lsq = *(floor(p')); # load in loop
5999 result = realign_load (msq, lsq, realignment_token);
6001 Input:
6002 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
6003 a memory location that may be unaligned.
6004 BSI - place where new code is to be inserted.
6005 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
6006 is used.
6008 Output:
6009 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
6010 target hook, if defined.
6011 Return value - the result of the loop-header phi node. */
6013 tree
6014 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
6015 gimple_stmt_iterator *gsi, tree *realignment_token,
6016 enum dr_alignment_support alignment_support_scheme,
6017 tree init_addr,
6018 class loop **at_loop)
6020 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6021 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6022 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
6023 struct data_reference *dr = dr_info->dr;
6024 class loop *loop = NULL;
6025 edge pe = NULL;
6026 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
6027 tree vec_dest;
6028 gimple *inc;
6029 tree ptr;
6030 tree data_ref;
6031 basic_block new_bb;
6032 tree msq_init = NULL_TREE;
6033 tree new_temp;
6034 gphi *phi_stmt;
6035 tree msq = NULL_TREE;
6036 gimple_seq stmts = NULL;
6037 bool compute_in_loop = false;
6038 bool nested_in_vect_loop = false;
6039 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
6040 class loop *loop_for_initial_load = NULL;
6042 if (loop_vinfo)
6044 loop = LOOP_VINFO_LOOP (loop_vinfo);
6045 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
6048 gcc_assert (alignment_support_scheme == dr_explicit_realign
6049 || alignment_support_scheme == dr_explicit_realign_optimized);
6051 /* We need to generate three things:
6052 1. the misalignment computation
6053 2. the extra vector load (for the optimized realignment scheme).
6054 3. the phi node for the two vectors from which the realignment is
6055 done (for the optimized realignment scheme). */
6057 /* 1. Determine where to generate the misalignment computation.
6059 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
6060 calculation will be generated by this function, outside the loop (in the
6061 preheader). Otherwise, INIT_ADDR had already been computed for us by the
6062 caller, inside the loop.
6064 Background: If the misalignment remains fixed throughout the iterations of
6065 the loop, then both realignment schemes are applicable, and also the
6066 misalignment computation can be done outside LOOP. This is because we are
6067 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
6068 are a multiple of VS (the Vector Size), and therefore the misalignment in
6069 different vectorized LOOP iterations is always the same.
6070 The problem arises only if the memory access is in an inner-loop nested
6071 inside LOOP, which is now being vectorized using outer-loop vectorization.
6072 This is the only case when the misalignment of the memory access may not
6073 remain fixed throughout the iterations of the inner-loop (as explained in
6074 detail in vect_supportable_dr_alignment). In this case, not only is the
6075 optimized realignment scheme not applicable, but also the misalignment
6076 computation (and generation of the realignment token that is passed to
6077 REALIGN_LOAD) have to be done inside the loop.
6079 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
6080 or not, which in turn determines if the misalignment is computed inside
6081 the inner-loop, or outside LOOP. */
6083 if (init_addr != NULL_TREE || !loop_vinfo)
6085 compute_in_loop = true;
6086 gcc_assert (alignment_support_scheme == dr_explicit_realign);
6090 /* 2. Determine where to generate the extra vector load.
6092 For the optimized realignment scheme, instead of generating two vector
6093 loads in each iteration, we generate a single extra vector load in the
6094 preheader of the loop, and in each iteration reuse the result of the
6095 vector load from the previous iteration. In case the memory access is in
6096 an inner-loop nested inside LOOP, which is now being vectorized using
6097 outer-loop vectorization, we need to determine whether this initial vector
6098 load should be generated at the preheader of the inner-loop, or can be
6099 generated at the preheader of LOOP. If the memory access has no evolution
6100 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
6101 to be generated inside LOOP (in the preheader of the inner-loop). */
6103 if (nested_in_vect_loop)
6105 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
6106 bool invariant_in_outerloop =
6107 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
6108 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
6110 else
6111 loop_for_initial_load = loop;
6112 if (at_loop)
6113 *at_loop = loop_for_initial_load;
6115 tree vuse = NULL_TREE;
6116 if (loop_for_initial_load)
6118 pe = loop_preheader_edge (loop_for_initial_load);
6119 if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
6120 vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
6122 if (!vuse)
6123 vuse = gimple_vuse (gsi_stmt (*gsi));
6125 /* 3. For the case of the optimized realignment, create the first vector
6126 load at the loop preheader. */
6128 if (alignment_support_scheme == dr_explicit_realign_optimized)
6130 /* Create msq_init = *(floor(p1)) in the loop preheader */
6131 gassign *new_stmt;
6133 gcc_assert (!compute_in_loop);
6134 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6135 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
6136 loop_for_initial_load, NULL_TREE,
6137 &init_addr, NULL, &inc, true);
6138 if (TREE_CODE (ptr) == SSA_NAME)
6139 new_temp = copy_ssa_name (ptr);
6140 else
6141 new_temp = make_ssa_name (TREE_TYPE (ptr));
6142 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
6143 tree type = TREE_TYPE (ptr);
6144 new_stmt = gimple_build_assign
6145 (new_temp, BIT_AND_EXPR, ptr,
6146 fold_build2 (MINUS_EXPR, type,
6147 build_int_cst (type, 0),
6148 build_int_cst (type, align)));
6149 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6150 gcc_assert (!new_bb);
6151 data_ref
6152 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
6153 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
6154 vect_copy_ref_info (data_ref, DR_REF (dr));
6155 new_stmt = gimple_build_assign (vec_dest, data_ref);
6156 new_temp = make_ssa_name (vec_dest, new_stmt);
6157 gimple_assign_set_lhs (new_stmt, new_temp);
6158 gimple_set_vuse (new_stmt, vuse);
6159 if (pe)
6161 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6162 gcc_assert (!new_bb);
6164 else
6165 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6167 msq_init = gimple_assign_lhs (new_stmt);
6170 /* 4. Create realignment token using a target builtin, if available.
6171 It is done either inside the containing loop, or before LOOP (as
6172 determined above). */
6174 if (targetm.vectorize.builtin_mask_for_load)
6176 gcall *new_stmt;
6177 tree builtin_decl;
6179 /* Compute INIT_ADDR - the initial address accessed by this memref. */
6180 if (!init_addr)
6182 /* Generate the INIT_ADDR computation outside LOOP. */
6183 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
6184 stmt_info, &stmts,
6185 NULL_TREE);
6186 if (loop)
6188 pe = loop_preheader_edge (loop);
6189 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6190 gcc_assert (!new_bb);
6192 else
6193 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
6196 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
6197 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
6198 vec_dest =
6199 vect_create_destination_var (scalar_dest,
6200 gimple_call_return_type (new_stmt));
6201 new_temp = make_ssa_name (vec_dest, new_stmt);
6202 gimple_call_set_lhs (new_stmt, new_temp);
6204 if (compute_in_loop)
6205 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6206 else
6208 /* Generate the misalignment computation outside LOOP. */
6209 pe = loop_preheader_edge (loop);
6210 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6211 gcc_assert (!new_bb);
6214 *realignment_token = gimple_call_lhs (new_stmt);
6216 /* The result of the CALL_EXPR to this builtin is determined from
6217 the value of the parameter and no global variables are touched
6218 which makes the builtin a "const" function. Requiring the
6219 builtin to have the "const" attribute makes it unnecessary
6220 to call mark_call_clobbered. */
6221 gcc_assert (TREE_READONLY (builtin_decl));
6224 if (alignment_support_scheme == dr_explicit_realign)
6225 return msq;
6227 gcc_assert (!compute_in_loop);
6228 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
6231 /* 5. Create msq = phi <msq_init, lsq> in loop */
6233 pe = loop_preheader_edge (containing_loop);
6234 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6235 msq = make_ssa_name (vec_dest);
6236 phi_stmt = create_phi_node (msq, containing_loop->header);
6237 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
6239 return msq;
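
/* Editorial illustration (not part of GCC): a scalar model of the
   dr_explicit_realign scheme set up above, assuming 16-byte vectors of
   unsigned char.  Two aligned loads bracket the unaligned address and the
   realignment "token" (modelled here simply as the misalignment in bytes)
   selects the bytes of the unaligned vector; when P is already aligned the
   second load is not used.  Helper name is hypothetical.  */

static void
realign_load_model (const unsigned char *p, unsigned char result[16])
{
  const unsigned char *msq = (const unsigned char *)
    ((__UINTPTR_TYPE__) p & ~(__UINTPTR_TYPE__) 15);	/* msq = *(floor (p))      */
  unsigned int token = p - msq;				/* realignment_token       */
  const unsigned char *lsq = msq + 16;			/* lsq = *(floor (p + 15)) */
  for (unsigned int i = 0; i < 16; i++)			/* realign_load            */
    result[i] = i + token < 16 ? msq[i + token] : lsq[i + token - 16];
}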
6243 /* Function vect_grouped_load_supported.
6245 COUNT is the size of the load group (the number of statements plus the
6246 number of gaps). SINGLE_ELEMENT_P is true if there is actually
6247 only one statement, with a gap of COUNT - 1.
6249 Returns true if a suitable permute exists. */
6251 bool
6252 vect_grouped_load_supported (tree vectype, bool single_element_p,
6253 unsigned HOST_WIDE_INT count)
6255 machine_mode mode = TYPE_MODE (vectype);
6257 /* If this is single-element interleaving with an element distance
6258 that leaves unused vector loads around punt - we at least create
6259 very sub-optimal code in that case (and blow up memory,
6260 see PR65518). */
6261 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
6263 if (dump_enabled_p ())
6264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6265 "single-element interleaving not supported "
6266 "for not adjacent vector loads\n");
6267 return false;
6270 /* vect_permute_load_chain requires the group size to be equal to 3 or
6271 be a power of two. */
6272 if (count != 3 && exact_log2 (count) == -1)
6274 if (dump_enabled_p ())
6275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6276 "the size of the group of accesses"
6277 " is not a power of 2 or not equal to 3\n");
6278 return false;
6281 /* Check that the permutation is supported. */
6282 if (VECTOR_MODE_P (mode))
6284 unsigned int i, j;
6285 if (count == 3)
6287 unsigned int nelt;
6288 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
6290 if (dump_enabled_p ())
6291 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6292 "cannot handle groups of 3 loads for"
6293 " variable-length vectors\n");
6294 return false;
6297 vec_perm_builder sel (nelt, nelt, 1);
6298 sel.quick_grow (nelt);
6299 vec_perm_indices indices;
6300 unsigned int k;
6301 for (k = 0; k < 3; k++)
6303 for (i = 0; i < nelt; i++)
6304 if (3 * i + k < 2 * nelt)
6305 sel[i] = 3 * i + k;
6306 else
6307 sel[i] = 0;
6308 indices.new_vector (sel, 2, nelt);
6309 if (!can_vec_perm_const_p (mode, mode, indices))
6311 if (dump_enabled_p ())
6312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6313 "shuffle of 3 loads is not supported by"
6314 " target\n");
6315 return false;
6317 for (i = 0, j = 0; i < nelt; i++)
6318 if (3 * i + k < 2 * nelt)
6319 sel[i] = i;
6320 else
6321 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6322 indices.new_vector (sel, 2, nelt);
6323 if (!can_vec_perm_const_p (mode, mode, indices))
6325 if (dump_enabled_p ())
6326 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6327 "shuffle of 3 loads is not supported by"
6328 " target\n");
6329 return false;
6332 return true;
6334 else
6336 /* If length is not equal to 3 then only power of 2 is supported. */
6337 gcc_assert (pow2p_hwi (count));
6338 poly_uint64 nelt = GET_MODE_NUNITS (mode);
6340 /* The encoding has a single stepped pattern. */
6341 vec_perm_builder sel (nelt, 1, 3);
6342 sel.quick_grow (3);
6343 for (i = 0; i < 3; i++)
6344 sel[i] = i * 2;
6345 vec_perm_indices indices (sel, 2, nelt);
6346 if (can_vec_perm_const_p (mode, mode, indices))
6348 for (i = 0; i < 3; i++)
6349 sel[i] = i * 2 + 1;
6350 indices.new_vector (sel, 2, nelt);
6351 if (can_vec_perm_const_p (mode, mode, indices))
6352 return true;
6357 if (dump_enabled_p ())
6358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6359 "extract even/odd not supported by target\n");
6360 return false;
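
/* Editorial illustration (not part of GCC): the full masks that the single
   stepped pattern checked above expands to for a constant NELT; e.g.
   NELT == 8 gives
     extract_even: { 0, 2, 4, 6, 8, 10, 12, 14 }
     extract_odd:  { 1, 3, 5, 7, 9, 11, 13, 15 }
   i.e. the even/odd elements of the two-vector concatenation.  Helper name
   is hypothetical.  */

static void
build_extract_even_odd_masks (unsigned int nelt, unsigned int *even_sel,
			      unsigned int *odd_sel)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      even_sel[i] = 2 * i;
      odd_sel[i] = 2 * i + 1;
    }
}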
6363 /* Return FN if vec_{masked_,mask_len_}load_lanes is available for COUNT vectors
6364 of type VECTYPE. MASKED_P says whether the masked form is needed. */
6366 internal_fn
6367 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6368 bool masked_p)
6370 if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6371 vec_mask_len_load_lanes_optab, vectype,
6372 count))
6373 return IFN_MASK_LEN_LOAD_LANES;
6374 else if (masked_p)
6376 if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6377 vec_mask_load_lanes_optab, vectype,
6378 count))
6379 return IFN_MASK_LOAD_LANES;
6381 else
6383 if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
6384 vectype, count))
6385 return IFN_LOAD_LANES;
6387 return IFN_LAST;
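
/* Editorial illustration (not part of GCC): the memory effect of an
   unmasked load-lanes operation for COUNT == 3, modelled with scalars -
   consecutive groups of COUNT elements are split into one lane per vector,
   which is why load-lanes can replace the permutations built by
   vect_permute_load_chain below.  Helper name is hypothetical.  */

static void
load_lanes_model (const int *src, int *v0, int *v1, int *v2,
		  unsigned int nelt)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      v0[i] = src[3 * i + 0];
      v1[i] = src[3 * i + 1];
      v2[i] = src[3 * i + 2];
    }
}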
6390 /* Function vect_permute_load_chain.
6392 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6393 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6394 the input data correctly. Return the final references for loads in
6395 RESULT_CHAIN.
6397 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6398 The input is 4 vectors each containing 8 elements. We assign a number to each
6399 element, the input sequence is:
6401 1st vec: 0 1 2 3 4 5 6 7
6402 2nd vec: 8 9 10 11 12 13 14 15
6403 3rd vec: 16 17 18 19 20 21 22 23
6404 4th vec: 24 25 26 27 28 29 30 31
6406 The output sequence should be:
6408 1st vec: 0 4 8 12 16 20 24 28
6409 2nd vec: 1 5 9 13 17 21 25 29
6410 3rd vec: 2 6 10 14 18 22 26 30
6411 4th vec: 3 7 11 15 19 23 27 31
6413 i.e., the first output vector should contain the first elements of each
6414 interleaving group, etc.
6416 We use extract_even/odd instructions to create such output. The input of
6417 each extract_even/odd operation is two vectors
6418 1st vec 2nd vec
6419 0 1 2 3 4 5 6 7
6421 and the output is the vector of extracted even/odd elements. The output of
6422 extract_even will be: 0 2 4 6
6423 and of extract_odd: 1 3 5 7
6426 The permutation is done in log LENGTH stages. In each stage extract_even
6427 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6428 their order. In our example,
6430 E1: extract_even (1st vec, 2nd vec)
6431 E2: extract_odd (1st vec, 2nd vec)
6432 E3: extract_even (3rd vec, 4th vec)
6433 E4: extract_odd (3rd vec, 4th vec)
6435 The output for the first stage will be:
6437 E1: 0 2 4 6 8 10 12 14
6438 E2: 1 3 5 7 9 11 13 15
6439 E3: 16 18 20 22 24 26 28 30
6440 E4: 17 19 21 23 25 27 29 31
6442 In order to proceed and create the correct sequence for the next stage (or
6443 for the correct output, if the second stage is the last one, as in our
6444 example), we first put the output of extract_even operation and then the
6445 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6446 The input for the second stage is:
6448 1st vec (E1): 0 2 4 6 8 10 12 14
6449 2nd vec (E3): 16 18 20 22 24 26 28 30
6450 3rd vec (E2): 1 3 5 7 9 11 13 15
6451 4th vec (E4): 17 19 21 23 25 27 29 31
6453 The output of the second stage:
6455 E1: 0 4 8 12 16 20 24 28
6456 E2: 2 6 10 14 18 22 26 30
6457 E3: 1 5 9 13 17 21 25 29
6458 E4: 3 7 11 15 19 23 27 31
6460 And RESULT_CHAIN after reordering:
6462 1st vec (E1): 0 4 8 12 16 20 24 28
6463 2nd vec (E3): 1 5 9 13 17 21 25 29
6464 3rd vec (E2): 2 6 10 14 18 22 26 30
6465 4th vec (E4): 3 7 11 15 19 23 27 31. */
6467 static void
6468 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6469 unsigned int length,
6470 stmt_vec_info stmt_info,
6471 gimple_stmt_iterator *gsi,
6472 vec<tree> *result_chain)
6474 tree data_ref, first_vect, second_vect;
6475 tree perm_mask_even, perm_mask_odd;
6476 tree perm3_mask_low, perm3_mask_high;
6477 gimple *perm_stmt;
6478 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6479 unsigned int i, j, log_length = exact_log2 (length);
6481 result_chain->quick_grow (length);
6482 memcpy (result_chain->address (), dr_chain.address (),
6483 length * sizeof (tree));
6485 if (length == 3)
6487 /* vect_grouped_load_supported ensures that this is constant. */
6488 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6489 unsigned int k;
6491 vec_perm_builder sel (nelt, nelt, 1);
6492 sel.quick_grow (nelt);
6493 vec_perm_indices indices;
6494 for (k = 0; k < 3; k++)
6496 for (i = 0; i < nelt; i++)
6497 if (3 * i + k < 2 * nelt)
6498 sel[i] = 3 * i + k;
6499 else
6500 sel[i] = 0;
6501 indices.new_vector (sel, 2, nelt);
6502 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6504 for (i = 0, j = 0; i < nelt; i++)
6505 if (3 * i + k < 2 * nelt)
6506 sel[i] = i;
6507 else
6508 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6509 indices.new_vector (sel, 2, nelt);
6510 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6512 first_vect = dr_chain[0];
6513 second_vect = dr_chain[1];
6515 /* Create interleaving stmt (low part of):
6516 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6517 ...}> */
6518 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6519 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6520 second_vect, perm3_mask_low);
6521 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6523 /* Create interleaving stmt (high part of):
6524 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6525 ...}> */
6526 first_vect = data_ref;
6527 second_vect = dr_chain[2];
6528 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6529 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6530 second_vect, perm3_mask_high);
6531 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6532 (*result_chain)[k] = data_ref;
6535 else
6537 /* If length is not equal to 3 then only power of 2 is supported. */
6538 gcc_assert (pow2p_hwi (length));
6540 /* The encoding has a single stepped pattern. */
6541 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6542 vec_perm_builder sel (nelt, 1, 3);
6543 sel.quick_grow (3);
6544 for (i = 0; i < 3; ++i)
6545 sel[i] = i * 2;
6546 vec_perm_indices indices (sel, 2, nelt);
6547 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6549 for (i = 0; i < 3; ++i)
6550 sel[i] = i * 2 + 1;
6551 indices.new_vector (sel, 2, nelt);
6552 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6554 for (i = 0; i < log_length; i++)
6556 for (j = 0; j < length; j += 2)
6558 first_vect = dr_chain[j];
6559 second_vect = dr_chain[j+1];
6561 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6562 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6563 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6564 first_vect, second_vect,
6565 perm_mask_even);
6566 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6567 (*result_chain)[j/2] = data_ref;
6569 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6570 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6571 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6572 first_vect, second_vect,
6573 perm_mask_odd);
6574 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6575 (*result_chain)[j/2+length/2] = data_ref;
6577 memcpy (dr_chain.address (), result_chain->address (),
6578 length * sizeof (tree));
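
/* Editorial illustration (not part of GCC): a scalar model of the
   power-of-2 branch above.  log2(LEN) extract_even/odd stages turn LEN
   interleaved vectors of NELT elements (stored consecutively in VECS) into
   the deinterleaved order shown in the function comment.  SCRATCH must hold
   LEN * NELT elements; helper name and memcpy use are editorial.  */

static void
permute_load_chain_model (int *vecs, unsigned int len, unsigned int nelt,
			  int *scratch)
{
  for (unsigned int stage = 1; stage < len; stage *= 2)
    {
      for (unsigned int j = 0; j < len; j += 2)
	{
	  const int *v1 = vecs + j * nelt;
	  const int *v2 = vecs + (j + 1) * nelt;
	  int *even = scratch + (j / 2) * nelt;
	  int *odd = scratch + (j / 2 + len / 2) * nelt;
	  for (unsigned int i = 0; i < nelt; i++)
	    {
	      /* Elements 2i and 2i+1 of the concatenation <v1, v2>.  */
	      even[i] = 2 * i < nelt ? v1[2 * i] : v2[2 * i - nelt];
	      odd[i] = 2 * i + 1 < nelt ? v1[2 * i + 1] : v2[2 * i + 1 - nelt];
	    }
	}
      __builtin_memcpy (vecs, scratch, len * nelt * sizeof (int));
    }
}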
6583 /* Function vect_shift_permute_load_chain.
6585 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6586 sequence of stmts to reorder the input data accordingly.
6587 Return the final references for loads in RESULT_CHAIN.
6588 Return true if successful, false otherwise.
6590 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6591 The input is 3 vectors each containing 8 elements. We assign a
6592 number to each element, the input sequence is:
6594 1st vec: 0 1 2 3 4 5 6 7
6595 2nd vec: 8 9 10 11 12 13 14 15
6596 3rd vec: 16 17 18 19 20 21 22 23
6598 The output sequence should be:
6600 1st vec: 0 3 6 9 12 15 18 21
6601 2nd vec: 1 4 7 10 13 16 19 22
6602 3rd vec: 2 5 8 11 14 17 20 23
6604 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6606 First we shuffle all 3 vectors to get correct elements order:
6608 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6609 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6610 3rd vec: (16 19 22) (17 20 23) (18 21)
6612 Next we unite and shift vector 3 times:
6614 1st step:
6615 shift right by 6 the concatenation of:
6616 "1st vec" and "2nd vec"
6617 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6618 "2nd vec" and "3rd vec"
6619 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6620 "3rd vec" and "1st vec"
6621 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6622 | New vectors |
6624 So that now new vectors are:
6626 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6627 2nd vec: (10 13) (16 19 22) (17 20 23)
6628 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6630 2nd step:
6631 shift right by 5 the concatenation of:
6632 "1st vec" and "3rd vec"
6633 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6634 "2nd vec" and "1st vec"
6635 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6636 "3rd vec" and "2nd vec"
6637 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6638 | New vectors |
6640 So that now new vectors are:
6642 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6643 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6644 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6646 3rd step:
6647 shift right by 5 the concatenation of:
6648 "1st vec" and "1st vec"
6649 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6650 shift right by 3 the concatenation of:
6651 "2nd vec" and "2nd vec"
6652 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6653 | New vectors |
6655 So that now all vectors are READY:
6656 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6657 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6658 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6660 This algorithm is faster than one in vect_permute_load_chain if:
6661 1. "shift of a concatination" is faster than general permutation.
6662 This is usually so.
6663 2. The TARGET machine can't execute vector instructions in parallel.
6664 This is because each step of the algorithm depends on the previous one.
6665 The algorithm in vect_permute_load_chain is much more parallel.
6667 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6670 static bool
6671 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6672 unsigned int length,
6673 stmt_vec_info stmt_info,
6674 gimple_stmt_iterator *gsi,
6675 vec<tree> *result_chain)
6677 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6678 tree perm2_mask1, perm2_mask2, perm3_mask;
6679 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6680 gimple *perm_stmt;
6682 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6683 machine_mode vmode = TYPE_MODE (vectype);
6684 unsigned int i;
6685 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6687 unsigned HOST_WIDE_INT nelt, vf;
6688 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6689 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6690 /* Not supported for variable-length vectors. */
6691 return false;
6693 vec_perm_builder sel (nelt, nelt, 1);
6694 sel.quick_grow (nelt);
6696 result_chain->quick_grow (length);
6697 memcpy (result_chain->address (), dr_chain.address (),
6698 length * sizeof (tree));
6700 if (pow2p_hwi (length) && vf > 4)
6702 unsigned int j, log_length = exact_log2 (length);
6703 for (i = 0; i < nelt / 2; ++i)
6704 sel[i] = i * 2;
6705 for (i = 0; i < nelt / 2; ++i)
6706 sel[nelt / 2 + i] = i * 2 + 1;
6707 vec_perm_indices indices (sel, 2, nelt);
6708 if (!can_vec_perm_const_p (vmode, vmode, indices))
6710 if (dump_enabled_p ())
6711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6712 "shuffle of 2 fields structure is not \
6713 supported by target\n");
6714 return false;
6716 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6718 for (i = 0; i < nelt / 2; ++i)
6719 sel[i] = i * 2 + 1;
6720 for (i = 0; i < nelt / 2; ++i)
6721 sel[nelt / 2 + i] = i * 2;
6722 indices.new_vector (sel, 2, nelt);
6723 if (!can_vec_perm_const_p (vmode, vmode, indices))
6725 if (dump_enabled_p ())
6726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6727 "shuffle of 2 fields structure is not \
6728 supported by target\n");
6729 return false;
6731 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6733 /* Generating permutation constant to shift all elements.
6734 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6735 for (i = 0; i < nelt; i++)
6736 sel[i] = nelt / 2 + i;
6737 indices.new_vector (sel, 2, nelt);
6738 if (!can_vec_perm_const_p (vmode, vmode, indices))
6740 if (dump_enabled_p ())
6741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6742 "shift permutation is not supported by target\n");
6743 return false;
6745 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6747 /* Generating permutation constant to select vector from 2.
6748 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6749 for (i = 0; i < nelt / 2; i++)
6750 sel[i] = i;
6751 for (i = nelt / 2; i < nelt; i++)
6752 sel[i] = nelt + i;
6753 indices.new_vector (sel, 2, nelt);
6754 if (!can_vec_perm_const_p (vmode, vmode, indices))
6756 if (dump_enabled_p ())
6757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6758 "select is not supported by target\n");
6759 return false;
6761 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6763 for (i = 0; i < log_length; i++)
6765 for (j = 0; j < length; j += 2)
6767 first_vect = dr_chain[j];
6768 second_vect = dr_chain[j + 1];
6770 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6771 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6772 first_vect, first_vect,
6773 perm2_mask1);
6774 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6775 vect[0] = data_ref;
6777 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6778 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6779 second_vect, second_vect,
6780 perm2_mask2);
6781 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6782 vect[1] = data_ref;
6784 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6785 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6786 vect[0], vect[1], shift1_mask);
6787 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6788 (*result_chain)[j/2 + length/2] = data_ref;
6790 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6791 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6792 vect[0], vect[1], select_mask);
6793 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6794 (*result_chain)[j/2] = data_ref;
6796 memcpy (dr_chain.address (), result_chain->address (),
6797 length * sizeof (tree));
6799 return true;
6801 if (length == 3 && vf > 2)
6803 unsigned int k = 0, l = 0;
6805 /* Generating permutation constant to get all elements in the right order.
6806 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6807 for (i = 0; i < nelt; i++)
6809 if (3 * k + (l % 3) >= nelt)
6811 k = 0;
6812 l += (3 - (nelt % 3));
6814 sel[i] = 3 * k + (l % 3);
6815 k++;
6817 vec_perm_indices indices (sel, 2, nelt);
6818 if (!can_vec_perm_const_p (vmode, vmode, indices))
6820 if (dump_enabled_p ())
6821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6822 "shuffle of 3 fields structure is not \
6823 supported by target\n");
6824 return false;
6826 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6828 /* Generating permutation constant to shift all elements.
6829 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6830 for (i = 0; i < nelt; i++)
6831 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6832 indices.new_vector (sel, 2, nelt);
6833 if (!can_vec_perm_const_p (vmode, vmode, indices))
6835 if (dump_enabled_p ())
6836 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6837 "shift permutation is not supported by target\n");
6838 return false;
6840 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6842 /* Generating permutation constant to shift all elements.
6843 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6844 for (i = 0; i < nelt; i++)
6845 sel[i] = 2 * (nelt / 3) + 1 + i;
6846 indices.new_vector (sel, 2, nelt);
6847 if (!can_vec_perm_const_p (vmode, vmode, indices))
6849 if (dump_enabled_p ())
6850 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6851 "shift permutation is not supported by target\n");
6852 return false;
6854 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6856 /* Generating permutation constant to shift all elements.
6857 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6858 for (i = 0; i < nelt; i++)
6859 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6860 indices.new_vector (sel, 2, nelt);
6861 if (!can_vec_perm_const_p (vmode, vmode, indices))
6863 if (dump_enabled_p ())
6864 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6865 "shift permutation is not supported by target\n");
6866 return false;
6868 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6870 /* Generating permutation constant to shift all elements.
6871 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6872 for (i = 0; i < nelt; i++)
6873 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6874 indices.new_vector (sel, 2, nelt);
6875 if (!can_vec_perm_const_p (vmode, vmode, indices))
6877 if (dump_enabled_p ())
6878 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6879 "shift permutation is not supported by target\n");
6880 return false;
6882 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6884 for (k = 0; k < 3; k++)
6886 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6887 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6888 dr_chain[k], dr_chain[k],
6889 perm3_mask);
6890 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6891 vect[k] = data_ref;
6894 for (k = 0; k < 3; k++)
6896 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6897 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6898 vect[k % 3], vect[(k + 1) % 3],
6899 shift1_mask);
6900 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6901 vect_shift[k] = data_ref;
6904 for (k = 0; k < 3; k++)
6905 {
6906 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6907 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6908 vect_shift[(4 - k) % 3],
6909 vect_shift[(3 - k) % 3],
6910 shift2_mask);
6911 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6912 vect[k] = data_ref;
6913 }
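/* Editorial note (not part of the original source): the first loop above
   de-interleaves each input vector in isolation using perm3_mask, while the
   second and third loops use the cross-vector shift1_mask/shift2_mask to
   combine pieces of the same field that ended up in different vectors.  The
   shift3_mask/shift4_mask permutations below only reorder elements within a
   single vector.  */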
6915 (*result_chain)[3 - (nelt % 3)] = vect[2];
6917 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6918 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6919 vect[0], shift3_mask);
6920 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6921 (*result_chain)[nelt % 3] = data_ref;
6923 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6924 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6925 vect[1], shift4_mask);
6926 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6927 (*result_chain)[0] = data_ref;
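/* Editorial note (not part of the original source): for nelt == 8, nelt % 3
   is 2, so the three results above are stored at (*result_chain)[1], [2] and
   [0] respectively.  */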
6928 return true;
6929 }
6930 return false;
6931 }
6933 /* Function vect_transform_grouped_load.
6935 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6936 to perform their permutation and attach the resulting vectorized statements
6937 to the scalar statements.
6938 */
6940 void
6941 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6942 vec<tree> dr_chain,
6943 int size, gimple_stmt_iterator *gsi)
6944 {
6945 machine_mode mode;
6946 vec<tree> result_chain = vNULL;
6948 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6949 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
6950 permuted vectors that are ready for vector computation.  */
6951 result_chain.create (size);
6953 /* If the reassociation width for the vector type is 2 or greater, the target
6954 machine can execute two or more vector instructions in parallel.  Otherwise
6955 try to get the chain for the load group using vect_shift_permute_load_chain.  */
6956 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6957 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6958 || pow2p_hwi (size)
6959 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6960 gsi, &result_chain))
6961 vect_permute_load_chain (vinfo, dr_chain,
6962 size, stmt_info, gsi, &result_chain);
6963 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6964 result_chain.release ();
6965 }
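/* Editorial illustration (not part of the original source): for a load group
   of size 3, pow2p_hwi (size) is false, so on a target whose reassociation
   width for VEC_PERM_EXPR is 1 the shift-based vect_shift_permute_load_chain
   is tried first and vect_permute_load_chain is only used if it fails.  For a
   power-of-two size such as 4, or when the target can execute several vector
   permutes in parallel, the generic vect_permute_load_chain is used
   directly.  */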
6967 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6968 generated as part of the vectorization of STMT_INFO. Assign the statement
6969 for each vector to the associated scalar statement. */
6971 void
6972 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6973 vec<tree> result_chain)
6974 {
6975 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6976 unsigned int i, gap_count;
6977 tree tmp_data_ref;
6979 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6980 Since we scan the chain starting from its first node, their order
6981 corresponds to the order of data-refs in RESULT_CHAIN.  */
6982 stmt_vec_info next_stmt_info = first_stmt_info;
6983 gap_count = 1;
6984 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6985 {
6986 if (!next_stmt_info)
6987 break;
6989 /* Skip the gaps.  Loads created for the gaps will be removed later by the
6990 dead code elimination pass.  No need to check for the first stmt in
6991 the group, since it always exists.
6992 DR_GROUP_GAP is the number of steps in elements from the previous
6993 access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
6994 correspond to the gaps.  */
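/* Editorial illustration (not part of the original source): for a group that
   accesses a[i] and a[i+3], DR_GROUP_GAP of the second member is 3, so the
   test below skips two entries of RESULT_CHAIN (gap_count 1 and 2) before the
   next vector is assigned; the skipped loads are dead and are removed later
   by DCE.  */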
6995 if (next_stmt_info != first_stmt_info
6996 && gap_count < DR_GROUP_GAP (next_stmt_info))
6997 {
6998 gap_count++;
6999 continue;
7000 }
7002 /* ??? The following needs cleanup after the removal of
7003 DR_GROUP_SAME_DR_STMT.  */
7004 if (next_stmt_info)
7005 {
7006 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
7007 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
7008 copies, and we put the new vector statement last.  */
7009 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
7011 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
7012 gap_count = 1;
7013 }
7014 }
7015 }
7017 /* Function vect_can_force_dr_alignment_p.
7019 Returns whether the alignment of a DECL can be forced to be aligned
7020 on an ALIGNMENT bit boundary.  */
7022 bool
7023 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
7024 {
7025 if (!VAR_P (decl))
7026 return false;
7028 if (decl_in_symtab_p (decl)
7029 && !symtab_node::get (decl)->can_increase_alignment_p ())
7030 return false;
7032 if (TREE_STATIC (decl))
7033 return (known_le (alignment,
7034 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
7035 else
7036 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
7037 }
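/* Editorial sketch (hypothetical caller, not from this file; loosely modeled
   on the vectorizer's increase_alignment pass): the predicate above is
   typically used to gate raising DECL_ALIGN, e.g.

     if (vect_can_force_dr_alignment_p (decl, TYPE_ALIGN (vectype)))
       {
         SET_DECL_ALIGN (decl, TYPE_ALIGN (vectype));
         DECL_USER_ALIGN (decl) = 1;
       }

   where TYPE_ALIGN is in bits, matching the ALIGNMENT parameter above.  */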
7039 /* Return whether the data reference DR_INFO is supported with respect to its
7040 alignment.  VECTYPE is the vector type used to access it and MISALIGNMENT
7041 is the access's known misalignment: 0 if it is known to be aligned and
7042 DR_MISALIGNMENT_UNKNOWN if the misalignment could not be determined, in
7043 which case the access may or may not turn out to be aligned at run time.  */
7045 enum dr_alignment_support
7046 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
7047 tree vectype, int misalignment)
7048 {
7049 data_reference *dr = dr_info->dr;
7050 stmt_vec_info stmt_info = dr_info->stmt;
7051 machine_mode mode = TYPE_MODE (vectype);
7052 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7053 class loop *vect_loop = NULL;
7054 bool nested_in_vect_loop = false;
7056 if (misalignment == 0)
7057 return dr_aligned;
7059 /* For now assume all conditional loads/stores support unaligned
7060 access without any special code. */
7061 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
7062 if (gimple_call_internal_p (stmt)
7063 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
7064 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
7065 return dr_unaligned_supported;
7067 if (loop_vinfo)
7068 {
7069 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
7070 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
7071 }
7073 /* Possibly unaligned access. */
7075 /* We can choose between using the implicit realignment scheme (generating
7076 a misaligned_move stmt) and the explicit realignment scheme (generating
7077 aligned loads with a REALIGN_LOAD).  There are two variants of the
7078 explicit realignment scheme: optimized and unoptimized.
7079 We can optimize the realignment only if the step between consecutive
7080 vector loads is equal to the vector size. Since the vector memory
7081 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
7082 is guaranteed that the misalignment amount remains the same throughout the
7083 execution of the vectorized loop. Therefore, we can create the
7084 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
7085 at the loop preheader.
7087 However, in the case of outer-loop vectorization, when vectorizing a
7088 memory access in the inner-loop nested within the LOOP that is now being
7089 vectorized, while it is guaranteed that the misalignment of the
7090 vectorized memory access will remain the same in different outer-loop
7091 iterations, it is *not* guaranteed that it will remain the same throughout
7092 the execution of the inner-loop. This is because the inner-loop advances
7093 with the original scalar step (and not in steps of VS). If the inner-loop
7094 step happens to be a multiple of VS, then the misalignment remains fixed
7095 and we can use the optimized realignment scheme. For example:
7097 for (i=0; i<N; i++)
7098 for (j=0; j<M; j++)
7099 s += a[i+j];
7101 When vectorizing the i-loop in the above example, the step between
7102 consecutive vector loads is 1, and so the misalignment does not remain
7103 fixed across the execution of the inner-loop, and the realignment cannot
7104 be optimized (as illustrated in the following pseudo vectorized loop):
7106 for (i=0; i<N; i+=4)
7107 for (j=0; j<M; j++){
7108 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
7109 // when j is {0,1,2,3,4,5,6,7,...} respectively.
7110 // (assuming that we start from an aligned address).
7111 }
7113 We therefore have to use the unoptimized realignment scheme:
7115 for (i=0; i<N; i+=4)
7116 for (j=k; j<M; j+=4)
7117 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
7118 // that the misalignment of the initial address is
7119 // 0).
7121 The loop can then be vectorized as follows:
7123 for (k=0; k<4; k++){
7124 rt = get_realignment_token (&vp[k]);
7125 for (i=0; i<N; i+=4){
7126 v1 = vp[i+k];
7127 for (j=k; j<M; j+=4){
7128 v2 = vp[i+j+VS-1];
7129 va = REALIGN_LOAD <v1,v2,rt>;
7130 vs += va;
7131 v1 = v2;
7132 }
7133 }
7134 } */
7136 if (DR_IS_READ (dr))
7137 {
7138 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
7139 && (!targetm.vectorize.builtin_mask_for_load
7140 || targetm.vectorize.builtin_mask_for_load ()))
7141 {
7142 /* If we are doing SLP then the accesses need not have the
7143 same alignment; instead it depends on the SLP group size.  */
7144 if (loop_vinfo
7145 && STMT_SLP_TYPE (stmt_info)
7146 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
7147 || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7148 * (DR_GROUP_SIZE
7149 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
7150 TYPE_VECTOR_SUBPARTS (vectype))))
7151 ;
7152 else if (!loop_vinfo
7153 || (nested_in_vect_loop
7154 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
7155 GET_MODE_SIZE (TYPE_MODE (vectype)))))
7156 return dr_explicit_realign;
7157 else
7158 return dr_explicit_realign_optimized;
7159 }
7160 }
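/* Editorial note (not part of the original source): dr_explicit_realign_optimized
   is only chosen when the misalignment is invariant across the vectorized
   loop, i.e. for loop accesses that are not in a nested inner loop, or for
   inner-loop accesses whose DR_STEP equals the vector size, as discussed in
   the large comment above; otherwise the per-iteration dr_explicit_realign
   scheme is used, and basic-block vectorization always falls back to
   dr_explicit_realign on this path.  */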
7162 bool is_packed = false;
7163 tree type = TREE_TYPE (DR_REF (dr));
7164 if (misalignment == DR_MISALIGNMENT_UNKNOWN)
7165 is_packed = not_size_aligned (DR_REF (dr));
7166 if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
7167 is_packed))
7168 return dr_unaligned_supported;
7170 /* Unsupported. */
7171 return dr_unaligned_unsupported;
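/* Editorial sketch (hypothetical caller, not from this file): users of this
   function typically switch over the returned classification, e.g.

     enum dr_alignment_support support
       = vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment);
     if (support == dr_unaligned_unsupported)
       ;  // reject, peel or version the loop for alignment
     else if (support == dr_explicit_realign
              || support == dr_explicit_realign_optimized)
       ;  // emit a REALIGN_LOAD-based sequence
     else
       ;  // dr_aligned or dr_unaligned_supported: plain (mis)aligned access

   which mirrors how the vectorizer's load/store handling consumes the
   result.  */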