gcc/tree-vect-data-refs.cc
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
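
/* As a rough illustration (not tied to any particular target): for
   VECTYPE V4SI and COUNT 2 the target must either provide an array mode
   for two V4SI vectors or an integer mode of 2 * 128 bits, and the
   vec_load_lanes / vec_store_lanes convert_optab must have a handler
   converting between that array mode and V4SImode; otherwise the
   load/store-lanes scheme is not used for the group.  */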
100 /* Helper function to identify a simd clone call. If this is a call to a
101 function with simd clones then return the corresponding cgraph_node,
102 otherwise return NULL. */
104 static cgraph_node*
105 simd_clone_call_p (gimple *stmt)
107 gcall *call = dyn_cast <gcall *> (stmt);
108 if (!call)
109 return NULL;
111 tree fndecl = NULL_TREE;
112 if (gimple_call_internal_p (call, IFN_MASK_CALL))
113 fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
114 else
115 fndecl = gimple_call_fndecl (stmt);
117 if (fndecl == NULL_TREE)
118 return NULL;
120 cgraph_node *node = cgraph_node::get (fndecl);
121 if (node && node->simd_clones != NULL)
122 return node;
124 return NULL;
129 /* Return the smallest scalar part of STMT_INFO.
130 This is used to determine the vectype of the stmt. We generally set the
131 vectype according to the type of the result (lhs). For stmts whose
132 result-type is different than the type of the arguments (e.g., demotion,
133 promotion), vectype will be reset appropriately (later). Note that we have
134 to visit the smallest datatype in this function, because that determines the
135 VF. If the smallest datatype in the loop is present only as the rhs of a
136 promotion operation - we'd miss it.
137 Such a case, where a variable of this datatype does not appear in the lhs
138 anywhere in the loop, can only occur if it's an invariant: e.g.:
139 'int_x = (int) short_inv', which we'd expect to have been optimized away by
140 invariant motion. However, we cannot rely on invariant motion to always
141 take invariants out of the loop, and so in the case of promotion we also
142 have to check the rhs.
143 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
144 types. */
146 tree
147 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
149 HOST_WIDE_INT lhs, rhs;
151 /* During the analysis phase, this function is called on arbitrary
152 statements that might not have scalar results. */
153 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
154 return scalar_type;
156 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
158 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
159 if (assign)
161 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
162 if (gimple_assign_cast_p (assign)
163 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
164 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
165 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
166 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
167 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
169 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
171 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
172 if (rhs < lhs)
173 scalar_type = rhs_type;
176 else if (cgraph_node *node = simd_clone_call_p (stmt_info->stmt))
178 auto clone = node->simd_clones->simdclone;
179 for (unsigned int i = 0; i < clone->nargs; ++i)
181 if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
183 tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type);
184 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type));
185 if (rhs < lhs)
187 scalar_type = arg_scalar_type;
188 lhs = rhs;
193 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
195 unsigned int i = 0;
196 if (gimple_call_internal_p (call))
198 internal_fn ifn = gimple_call_internal_fn (call);
199 if (internal_load_fn_p (ifn))
200 /* For loads the LHS type does the trick. */
201 i = ~0U;
202 else if (internal_store_fn_p (ifn))
 204 /* For stores use the type of the stored value. */
205 i = internal_fn_stored_value_index (ifn);
206 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
207 i = ~0U;
209 else if (internal_fn_mask_index (ifn) == 0)
210 i = 1;
212 if (i < gimple_call_num_args (call))
214 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
215 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
217 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
218 if (rhs < lhs)
219 scalar_type = rhs_type;
224 return scalar_type;
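
/* For example (assuming 128-bit vectors): in a loop containing only
     int_x = (int) short_y
   the lhs is 4 bytes but the rhs is 2 bytes, so this function returns the
   short type; the VF is then based on 8 shorts per vector rather than
   4 ints.  */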
228 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
229 tested at run-time. Return TRUE if DDR was successfully inserted.
230 Return false if versioning is not supported. */
232 static opt_result
233 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
235 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
237 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
238 return opt_result::failure_at (vect_location,
239 "will not create alias checks, as"
240 " --param vect-max-version-for-alias-checks"
241 " == 0\n");
243 opt_result res
244 = runtime_alias_check_p (ddr, loop,
245 optimize_loop_nest_for_speed_p (loop));
246 if (!res)
247 return res;
249 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
250 return opt_result::success ();
253 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
255 static void
256 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
258 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
259 for (unsigned int i = 0; i < checks.length(); ++i)
260 if (checks[i] == value)
261 return;
263 if (dump_enabled_p ())
264 dump_printf_loc (MSG_NOTE, vect_location,
265 "need run-time check that %T is nonzero\n",
266 value);
267 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
270 /* Return true if we know that the order of vectorized DR_INFO_A and
271 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
272 DR_INFO_B. At least one of the accesses is a write. */
274 static bool
275 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
277 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
278 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
280 /* Single statements are always kept in their original order. */
281 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
282 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
283 return true;
285 /* If there is a loop invariant read involved we might vectorize it in
 286 the prologue, breaking scalar order with respect to the in-loop store. */
287 if ((DR_IS_READ (dr_info_a->dr) && integer_zerop (DR_STEP (dr_info_a->dr)))
288 || (DR_IS_READ (dr_info_b->dr) && integer_zerop (DR_STEP (dr_info_b->dr))))
289 return false;
291 /* STMT_A and STMT_B belong to overlapping groups. All loads are
292 emitted at the position of the first scalar load.
293 Stores in a group are emitted at the position of the last scalar store.
294 Compute that position and check whether the resulting order matches
295 the current one. */
296 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
297 if (il_a)
299 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
300 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
301 s = DR_GROUP_NEXT_ELEMENT (s))
302 il_a = get_later_stmt (il_a, s);
303 else /* DR_IS_READ */
304 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
305 s = DR_GROUP_NEXT_ELEMENT (s))
306 if (get_later_stmt (il_a, s) == il_a)
307 il_a = s;
309 else
310 il_a = stmtinfo_a;
311 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
312 if (il_b)
314 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
315 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
316 s = DR_GROUP_NEXT_ELEMENT (s))
317 il_b = get_later_stmt (il_b, s);
318 else /* DR_IS_READ */
319 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
320 s = DR_GROUP_NEXT_ELEMENT (s))
321 if (get_later_stmt (il_b, s) == il_b)
322 il_b = s;
324 else
325 il_b = stmtinfo_b;
326 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
327 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
330 /* A subroutine of vect_analyze_data_ref_dependence. Handle
331 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
332 distances. These distances are conservatively correct but they don't
333 reflect a guaranteed dependence.
335 Return true if this function does all the work necessary to avoid
336 an alias or false if the caller should use the dependence distances
337 to limit the vectorization factor in the usual way. LOOP_DEPTH is
338 the depth of the loop described by LOOP_VINFO and the other arguments
339 are as for vect_analyze_data_ref_dependence. */
341 static bool
342 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
343 loop_vec_info loop_vinfo,
344 int loop_depth, unsigned int *max_vf)
346 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
347 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
349 int dist = dist_v[loop_depth];
350 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
352 /* If the user asserted safelen >= DIST consecutive iterations
353 can be executed concurrently, assume independence.
355 ??? An alternative would be to add the alias check even
356 in this case, and vectorize the fallback loop with the
357 maximum VF set to safelen. However, if the user has
358 explicitly given a length, it's less likely that that
359 would be a win. */
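 /* For instance, with
      #pragma omp simd safelen(4)
    a conservative dependence distance of 3 is ignored here (3 <= 4), and
    *max_vf is capped at 4 instead of adding a runtime alias check.  */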
360 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
362 if ((unsigned int) loop->safelen < *max_vf)
363 *max_vf = loop->safelen;
364 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
365 continue;
368 /* For dependence distances of 2 or more, we have the option
369 of limiting VF or checking for an alias at runtime.
370 Prefer to check at runtime if we can, to avoid limiting
371 the VF unnecessarily when the bases are in fact independent.
373 Note that the alias checks will be removed if the VF ends up
374 being small enough. */
375 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
376 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
377 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
378 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
379 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
382 return true;
386 /* Function vect_analyze_data_ref_dependence.
388 FIXME: I needed to change the sense of the returned flag.
390 Return FALSE if there (might) exist a dependence between a memory-reference
391 DRA and a memory-reference DRB. When versioning for alias may check a
392 dependence at run-time, return TRUE. Adjust *MAX_VF according to
393 the data dependence. */
395 static opt_result
396 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
397 loop_vec_info loop_vinfo,
398 unsigned int *max_vf)
400 unsigned int i;
401 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
402 struct data_reference *dra = DDR_A (ddr);
403 struct data_reference *drb = DDR_B (ddr);
404 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
405 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
406 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
407 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
408 lambda_vector dist_v;
409 unsigned int loop_depth;
411 /* If user asserted safelen consecutive iterations can be
412 executed concurrently, assume independence. */
413 auto apply_safelen = [&]()
415 if (loop->safelen >= 2)
417 if ((unsigned int) loop->safelen < *max_vf)
418 *max_vf = loop->safelen;
419 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
420 return true;
422 return false;
425 /* In loop analysis all data references should be vectorizable. */
426 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
427 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
428 gcc_unreachable ();
430 /* Independent data accesses. */
431 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
432 return opt_result::success ();
434 if (dra == drb
435 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
436 return opt_result::success ();
438 /* We do not have to consider dependences between accesses that belong
439 to the same group, unless the stride could be smaller than the
440 group size. */
441 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
442 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
443 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
444 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
445 return opt_result::success ();
447 /* Even if we have an anti-dependence then, as the vectorized loop covers at
448 least two scalar iterations, there is always also a true dependence.
449 As the vectorizer does not re-order loads and stores we can ignore
450 the anti-dependence if TBAA can disambiguate both DRs similar to the
451 case with known negative distance anti-dependences (positive
452 distance anti-dependences would violate TBAA constraints). */
453 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
454 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
455 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
456 get_alias_set (DR_REF (drb))))
457 return opt_result::success ();
459 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
460 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
462 if (apply_safelen ())
463 return opt_result::success ();
465 return opt_result::failure_at
466 (stmtinfo_a->stmt,
467 "possible alias involving gather/scatter between %T and %T\n",
468 DR_REF (dra), DR_REF (drb));
471 /* Unknown data dependence. */
472 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
474 if (apply_safelen ())
475 return opt_result::success ();
477 if (dump_enabled_p ())
478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
479 "versioning for alias required: "
480 "can't determine dependence between %T and %T\n",
481 DR_REF (dra), DR_REF (drb));
483 /* Add to list of ddrs that need to be tested at run-time. */
484 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
487 /* Known data dependence. */
488 if (DDR_NUM_DIST_VECTS (ddr) == 0)
490 if (apply_safelen ())
491 return opt_result::success ();
493 if (dump_enabled_p ())
494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
495 "versioning for alias required: "
496 "bad dist vector for %T and %T\n",
497 DR_REF (dra), DR_REF (drb));
498 /* Add to list of ddrs that need to be tested at run-time. */
499 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
502 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
504 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
505 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
506 loop_depth, max_vf))
507 return opt_result::success ();
509 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
511 int dist = dist_v[loop_depth];
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE, vect_location,
515 "dependence distance = %d.\n", dist);
517 if (dist == 0)
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "dependence distance == 0 between %T and %T\n",
522 DR_REF (dra), DR_REF (drb));
524 /* When we perform grouped accesses and perform implicit CSE
525 by detecting equal accesses and doing disambiguation with
526 runtime alias tests like for
527 .. = a[i];
528 .. = a[i+1];
529 a[i] = ..;
530 a[i+1] = ..;
531 *p = ..;
532 .. = a[i];
533 .. = a[i+1];
534 where we will end up loading { a[i], a[i+1] } once, make
535 sure that inserting group loads before the first load and
536 stores after the last store will do the right thing.
537 Similar for groups like
538 a[i] = ...;
539 ... = a[i];
540 a[i+1] = ...;
541 where loads from the group interleave with the store. */
542 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
543 return opt_result::failure_at (stmtinfo_a->stmt,
544 "READ_WRITE dependence"
545 " in interleaving.\n");
547 if (loop->safelen < 2)
549 tree indicator = dr_zero_step_indicator (dra);
550 if (!indicator || integer_zerop (indicator))
551 return opt_result::failure_at (stmtinfo_a->stmt,
552 "access also has a zero step\n");
553 else if (TREE_CODE (indicator) != INTEGER_CST)
554 vect_check_nonzero_value (loop_vinfo, indicator);
556 continue;
559 if (dist > 0 && DDR_REVERSED_P (ddr))
561 /* If DDR_REVERSED_P the order of the data-refs in DDR was
562 reversed (to make distance vector positive), and the actual
563 distance is negative. */
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE, vect_location,
566 "dependence distance negative.\n");
567 /* When doing outer loop vectorization, we need to check if there is
568 a backward dependence at the inner loop level if the dependence
569 at the outer loop is reversed. See PR81740. */
570 if (nested_in_vect_loop_p (loop, stmtinfo_a)
571 || nested_in_vect_loop_p (loop, stmtinfo_b))
573 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
574 DDR_LOOP_NEST (ddr));
575 if (dist_v[inner_depth] < 0)
576 return opt_result::failure_at (stmtinfo_a->stmt,
577 "not vectorized, dependence "
578 "between data-refs %T and %T\n",
579 DR_REF (dra), DR_REF (drb));
581 /* Record a negative dependence distance to later limit the
582 amount of stmt copying / unrolling we can perform.
583 Only need to handle read-after-write dependence. */
584 if (DR_IS_READ (drb)
585 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
586 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
587 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
588 continue;
591 unsigned int abs_dist = abs (dist);
592 if (abs_dist >= 2 && abs_dist < *max_vf)
594 /* The dependence distance requires reduction of the maximal
595 vectorization factor. */
596 *max_vf = abs_dist;
597 if (dump_enabled_p ())
598 dump_printf_loc (MSG_NOTE, vect_location,
599 "adjusting maximal vectorization factor to %i\n",
600 *max_vf);
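 /* Example: for
      for (i = 0; i < n; ++i)
        a[i + 3] = a[i] + 1;
    the dependence distance is 3; with VF 4 the vector load of
    a[i .. i+3] would read a[i+3] before the store from iteration i,
    so the maximal VF is reduced to 3.  */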
603 if (abs_dist >= *max_vf)
605 /* Dependence distance does not create dependence, as far as
606 vectorization is concerned, in this case. */
607 if (dump_enabled_p ())
608 dump_printf_loc (MSG_NOTE, vect_location,
609 "dependence distance >= VF.\n");
610 continue;
613 return opt_result::failure_at (stmtinfo_a->stmt,
614 "not vectorized, possible dependence "
615 "between data-refs %T and %T\n",
616 DR_REF (dra), DR_REF (drb));
619 return opt_result::success ();
622 /* Function vect_analyze_early_break_dependences.
624 Examine all the data references in the loop and make sure that if we have
625 multiple exits that we are able to safely move stores such that they become
626 safe for vectorization. The function also calculates the place where to move
627 the instructions to and computes what the new vUSE chain should be.
629 This works in tandem with the CFG that will be produced by
630 slpeel_tree_duplicate_loop_to_edge_cfg later on.
632 This function tries to validate whether an early break vectorization
 633 is possible for the current instruction sequence. Returns true if
 634 possible, otherwise false.
636 Requirements:
637 - Any memory access must be to a fixed size buffer.
638 - There must not be any loads and stores to the same object.
639 - Multiple loads are allowed as long as they don't alias.
641 NOTE:
 642 This implementation is very conservative. Any overlapping loads/stores
 643 that take place before the early break statement get rejected, aside from
 644 WAR dependencies.
646 i.e.:
648 a[i] = 8
649 c = a[i]
650 if (b[i])
653 is not allowed, but
655 c = a[i]
656 a[i] = 8
657 if (b[i])
 660 is, which is the common case. */
662 static opt_result
663 vect_analyze_early_break_dependences (loop_vec_info loop_vinfo)
665 DUMP_VECT_SCOPE ("vect_analyze_early_break_dependences");
667 /* List of all load data references found during traversal. */
668 auto_vec<data_reference *> bases;
669 basic_block dest_bb = NULL;
671 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
672 class loop *loop_nest = loop_outer (loop);
674 if (dump_enabled_p ())
675 dump_printf_loc (MSG_NOTE, vect_location,
676 "loop contains multiple exits, analyzing"
677 " statement dependencies.\n");
679 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
680 if (dump_enabled_p ())
681 dump_printf_loc (MSG_NOTE, vect_location,
682 "alternate exit has been chosen as main exit.\n");
684 /* Since we don't support general control flow, the location we'll move the
685 side-effects to is always the latch connected exit. When we support
686 general control flow we can do better but for now this is fine. Move
687 side-effects to the in-loop destination of the last early exit. For the PEELED
688 case we move the side-effects to the latch block as this is guaranteed to be the
 689 last block to be executed when a vector iteration finishes. */
690 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
691 dest_bb = loop->latch;
692 else
693 dest_bb = single_pred (loop->latch);
695 /* We start looking from dest_bb, for the non-PEELED case we don't want to
696 move any stores already present, but we do want to read and validate the
697 loads. */
698 basic_block bb = dest_bb;
 700 /* In the peeled case we need to check all the loads in the loop, since to
 701 move the stores we lift them over all loads into the latch. */
702 bool check_deps = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
706 gimple_stmt_iterator gsi = gsi_last_bb (bb);
708 /* Now analyze all the remaining statements and try to determine which
709 instructions are allowed/needed to be moved. */
710 while (!gsi_end_p (gsi))
712 gimple *stmt = gsi_stmt (gsi);
713 gsi_prev (&gsi);
714 if (!gimple_has_ops (stmt)
715 || is_gimple_debug (stmt))
716 continue;
718 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt);
719 auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo);
720 if (!dr_ref)
721 continue;
 723 /* Check whether vector accesses to the object will be within bounds:
 724 the object size must be a constant, or we must assume the loop will be
 725 versioned or its niters bounded by VF so accesses are within range.
 726 We only need to check the reads, since writes are moved to a safe place
 727 where, if we get there, we know they are safe to perform. */
728 if (DR_IS_READ (dr_ref)
729 && !ref_within_array_bound (stmt, DR_REF (dr_ref)))
731 if (dump_enabled_p ())
732 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
733 "early breaks not supported: vectorization "
734 "would %s beyond size of obj.",
735 DR_IS_READ (dr_ref) ? "read" : "write");
736 return opt_result::failure_at (stmt,
737 "can't safely apply code motion to "
738 "dependencies of %G to vectorize "
739 "the early exit.\n", stmt);
742 if (!check_deps)
743 continue;
745 if (DR_IS_READ (dr_ref))
746 bases.safe_push (dr_ref);
747 else if (DR_IS_WRITE (dr_ref))
749 /* We are moving writes down in the CFG. To be sure that this
750 is valid after vectorization we have to check all the loads
751 we are sinking the stores past to see if any of them may
752 alias or are the same object.
754 Same objects will not be an issue because unless the store
755 is marked volatile the value can be forwarded. If the
756 store is marked volatile we don't vectorize the loop
757 anyway.
759 That leaves the check for aliasing. We don't really need
760 to care about the stores aliasing with each other since the
761 stores are moved in order so the effects are still observed
762 correctly. This leaves the check for WAR dependencies
763 which we would be introducing here if the DR can alias.
764 The check is quadratic in loads/stores but I have not found
765 a better API to do this. I believe all loads and stores
766 must be checked. We also must check them when we
767 encountered the store, since we don't care about loads past
768 the store. */
770 for (auto dr_read : bases)
771 if (dr_may_alias_p (dr_ref, dr_read, loop_nest))
773 if (dump_enabled_p ())
774 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
775 vect_location,
776 "early breaks not supported: "
777 "overlapping loads and stores "
778 "found before the break "
779 "statement.\n");
781 return opt_result::failure_at (stmt,
782 "can't safely apply code motion to dependencies"
783 " to vectorize the early exit. %G may alias with"
784 " %G\n", stmt, dr_read->stmt);
788 if (gimple_vdef (stmt))
790 if (dump_enabled_p ())
791 dump_printf_loc (MSG_NOTE, vect_location,
792 "==> recording stmt %G", stmt);
794 LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (stmt);
796 else if (gimple_vuse (stmt))
798 LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).safe_insert (0, stmt);
799 if (dump_enabled_p ())
800 dump_printf_loc (MSG_NOTE, vect_location,
801 "marked statement for vUSE update: %G", stmt);
805 if (!single_pred_p (bb))
807 gcc_assert (bb == loop->header);
808 break;
811 /* For the non-PEELED case we don't want to check the loads in the IV exit block
 812 for dependencies with the stores, but any block preceding it we do. */
813 check_deps = true;
814 bb = single_pred (bb);
816 while (1);
818 /* We don't allow outer -> inner loop transitions which should have been
819 trapped already during loop form analysis. */
820 gcc_assert (dest_bb->loop_father == loop);
822 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo) = dest_bb;
824 if (!LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).is_empty ())
826 /* All uses shall be updated to that of the first load. Entries are
827 stored in reverse order. */
828 tree vuse = gimple_vuse (LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).last ());
829 for (auto g : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
831 if (dump_enabled_p ())
832 dump_printf_loc (MSG_NOTE, vect_location,
833 "will update use: %T, mem_ref: %G", vuse, g);
837 if (dump_enabled_p ())
838 dump_printf_loc (MSG_NOTE, vect_location,
839 "recorded statements to be moved to BB %d\n",
840 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo)->index);
842 return opt_result::success ();
845 /* Function vect_analyze_data_ref_dependences.
847 Examine all the data references in the loop, and make sure there do not
848 exist any data dependences between them. Set *MAX_VF according to
849 the maximum vectorization factor the data dependences allow. */
851 opt_result
852 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
853 unsigned int *max_vf)
855 unsigned int i;
856 struct data_dependence_relation *ddr;
858 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
860 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
862 LOOP_VINFO_DDRS (loop_vinfo)
863 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
864 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
865 /* We do not need read-read dependences. */
866 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
867 &LOOP_VINFO_DDRS (loop_vinfo),
868 LOOP_VINFO_LOOP_NEST (loop_vinfo),
869 false);
870 gcc_assert (res);
873 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
875 /* For epilogues we either have no aliases or alias versioning
876 was applied to original loop. Therefore we may just get max_vf
877 using VF of original loop. */
878 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
879 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
880 else
881 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
883 opt_result res
884 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
885 if (!res)
886 return res;
889 /* If we have early break statements in the loop, check to see if they
 890 are of a form we can vectorize. */
891 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
892 return vect_analyze_early_break_dependences (loop_vinfo);
894 return opt_result::success ();
898 /* Function vect_slp_analyze_data_ref_dependence.
900 Return TRUE if there (might) exist a dependence between a memory-reference
901 DRA and a memory-reference DRB for VINFO. When versioning for alias
902 may check a dependence at run-time, return FALSE. Adjust *MAX_VF
903 according to the data dependence. */
905 static bool
906 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
907 struct data_dependence_relation *ddr)
909 struct data_reference *dra = DDR_A (ddr);
910 struct data_reference *drb = DDR_B (ddr);
911 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
912 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
914 /* We need to check dependences of statements marked as unvectorizable
915 as well, they still can prohibit vectorization. */
917 /* Independent data accesses. */
918 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
919 return false;
921 if (dra == drb)
922 return false;
924 /* Read-read is OK. */
925 if (DR_IS_READ (dra) && DR_IS_READ (drb))
926 return false;
928 /* If dra and drb are part of the same interleaving chain consider
929 them independent. */
930 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
931 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
932 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
933 return false;
935 /* Unknown data dependence. */
936 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
938 if (dump_enabled_p ())
939 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
940 "can't determine dependence between %T and %T\n",
941 DR_REF (dra), DR_REF (drb));
943 else if (dump_enabled_p ())
944 dump_printf_loc (MSG_NOTE, vect_location,
945 "determined dependence between %T and %T\n",
946 DR_REF (dra), DR_REF (drb));
948 return true;
952 /* Analyze dependences involved in the transform of a store SLP NODE. */
954 static bool
955 vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
957 /* This walks over all stmts involved in the SLP store done
958 in NODE verifying we can sink them up to the last stmt in the
959 group. */
960 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
961 gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
963 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
965 stmt_vec_info access_info
966 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
967 if (access_info == last_access_info)
968 continue;
969 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
970 ao_ref ref;
971 bool ref_initialized_p = false;
972 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
973 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
975 gimple *stmt = gsi_stmt (gsi);
976 if (! gimple_vuse (stmt))
977 continue;
979 /* If we couldn't record a (single) data reference for this
980 stmt we have to resort to the alias oracle. */
981 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
982 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
983 if (!dr_b)
985 /* We are moving a store - this means
986 we cannot use TBAA for disambiguation. */
987 if (!ref_initialized_p)
988 ao_ref_init (&ref, DR_REF (dr_a));
989 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
990 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
991 return false;
992 continue;
995 gcc_assert (!gimple_visited_p (stmt));
997 ddr_p ddr = initialize_data_dependence_relation (dr_a,
998 dr_b, vNULL);
999 bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1000 free_dependence_relation (ddr);
1001 if (dependent)
1002 return false;
1005 return true;
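
/* As an illustration: for an SLP store group
     a[0] = x; *p = y; a[1] = z;
   sinking a[0] = x down to the position of a[1] = z is only valid if the
   intervening *p = y cannot alias the a[] accesses, which is what the walk
   above checks (falling back to the alias oracle when no single data
   reference was recorded for a statement).  */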
1008 /* Analyze dependences involved in the transform of a load SLP NODE. STORES
1009 contain the vector of scalar stores of this instance if we are
1010 disambiguating the loads. */
1012 static bool
1013 vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
1014 vec<stmt_vec_info> stores,
1015 stmt_vec_info last_store_info)
1017 /* This walks over all stmts involved in the SLP load done
1018 in NODE verifying we can hoist them up to the first stmt in the
1019 group. */
1020 stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
1021 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
1023 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
1025 stmt_vec_info access_info
1026 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
1027 if (access_info == first_access_info)
1028 continue;
1029 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
1030 ao_ref ref;
1031 bool ref_initialized_p = false;
1032 hash_set<stmt_vec_info> grp_visited;
1033 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
1034 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
1036 gimple *stmt = gsi_stmt (gsi);
1037 if (! gimple_vdef (stmt))
1038 continue;
1040 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1042 /* If we run into a store of this same instance (we've just
1043 marked those) then delay dependence checking until we run
1044 into the last store because this is where it will have
1045 been sunk to (and we verified that we can do that already). */
1046 if (gimple_visited_p (stmt))
1048 if (stmt_info != last_store_info)
1049 continue;
1051 for (stmt_vec_info &store_info : stores)
1053 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
1054 ddr_p ddr = initialize_data_dependence_relation
1055 (dr_a, store_dr, vNULL);
1056 bool dependent
1057 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1058 free_dependence_relation (ddr);
1059 if (dependent)
1060 return false;
1062 continue;
1065 auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
1067 /* We are hoisting a load - this means we can use TBAA for
1068 disambiguation. */
1069 if (!ref_initialized_p)
1070 ao_ref_init (&ref, DR_REF (dr_a));
1071 if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
1073 /* If we couldn't record a (single) data reference for this
1074 stmt we have to give up now. */
1075 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1076 if (!dr_b)
1077 return false;
1078 ddr_p ddr = initialize_data_dependence_relation (dr_a,
1079 dr_b, vNULL);
1080 bool dependent
1081 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1082 free_dependence_relation (ddr);
1083 if (dependent)
1084 return false;
1086 /* No dependence. */
1087 return true;
1089 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1091 /* When we run into a store group we have to honor
1092 that earlier stores might be moved here. We don't
1093 know exactly which and where to since we lack a
1094 back-mapping from DR to SLP node, so assume all
1095 earlier stores are sunk here. It's enough to
1096 consider the last stmt of a group for this.
1097 ??? Both this and the fact that we disregard that
1098 the conflicting instance might be removed later
1099 is overly conservative. */
1100 if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
1101 for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
1102 store_info != NULL;
1103 store_info = DR_GROUP_NEXT_ELEMENT (store_info))
1104 if ((store_info == stmt_info
1105 || get_later_stmt (store_info, stmt_info) == stmt_info)
1106 && !check_hoist (store_info))
1107 return false;
1109 else
1111 if (!check_hoist (stmt_info))
1112 return false;
1116 return true;
1120 /* Function vect_analyze_data_ref_dependences.
1122 Examine all the data references in the basic-block, and make sure there
1123 do not exist any data dependences between them. Set *MAX_VF according to
1124 the maximum vectorization factor the data dependences allow. */
1126 bool
1127 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
1129 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
1131 /* The stores of this instance are at the root of the SLP tree. */
1132 slp_tree store = NULL;
1133 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
1134 store = SLP_INSTANCE_TREE (instance);
1136 /* Verify we can sink stores to the vectorized stmt insert location. */
1137 stmt_vec_info last_store_info = NULL;
1138 if (store)
1140 if (! vect_slp_analyze_store_dependences (vinfo, store))
1141 return false;
1143 /* Mark stores in this instance and remember the last one. */
1144 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
1145 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1146 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
1149 bool res = true;
1151 /* Verify we can sink loads to the vectorized stmt insert location,
1152 special-casing stores of this instance. */
1153 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
1154 if (! vect_slp_analyze_load_dependences (vinfo, load,
1155 store
1156 ? SLP_TREE_SCALAR_STMTS (store)
1157 : vNULL, last_store_info))
1159 res = false;
1160 break;
1163 /* Unset the visited flag. */
1164 if (store)
1165 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1166 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
1168 return res;
1171 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
1172 applied. */
1175 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
1177 HOST_WIDE_INT diff = 0;
1178 /* Alignment is only analyzed for the first element of a DR group,
1179 use that but adjust misalignment by the offset of the access. */
1180 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
1182 dr_vec_info *first_dr
1183 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
1184 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
1185 INTEGER_CSTs and the first element in the group has the lowest
1186 address. */
1187 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
1188 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
1189 gcc_assert (diff >= 0);
1190 dr_info = first_dr;
1193 int misalign = dr_info->misalignment;
1194 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
1195 if (misalign == DR_MISALIGNMENT_UNKNOWN)
1196 return misalign;
1198 /* If the access is only aligned for a vector type with smaller alignment
1199 requirement the access has unknown misalignment. */
1200 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
1201 targetm.vectorize.preferred_vector_alignment (vectype)))
1202 return DR_MISALIGNMENT_UNKNOWN;
1204 /* Apply the offset from the DR group start and the externally supplied
1205 offset which can for example result from a negative stride access. */
1206 poly_int64 misalignment = misalign + diff + offset;
1208 /* vect_compute_data_ref_alignment will have ensured that target_alignment
1209 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
1210 unsigned HOST_WIDE_INT target_alignment_c
1211 = dr_info->target_alignment.to_constant ();
1212 if (!known_misalignment (misalignment, target_alignment_c, &misalign))
1213 return DR_MISALIGNMENT_UNKNOWN;
1214 return misalign;
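
/* Worked example (assuming a 16-byte target alignment): if the first
   element of the group has misalignment 8 and this DR is 4 bytes further
   into the group (DR_INIT difference of 4), then with a zero OFFSET the
   returned misalignment is (8 + 4) % 16 == 12.  */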
1217 /* Record the base alignment guarantee given by DRB, which occurs
1218 in STMT_INFO. */
1220 static void
1221 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
1222 innermost_loop_behavior *drb)
1224 bool existed;
1225 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
1226 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
1227 if (!existed || entry.second->base_alignment < drb->base_alignment)
1229 entry = std::make_pair (stmt_info, drb);
1230 if (dump_enabled_p ())
1231 dump_printf_loc (MSG_NOTE, vect_location,
1232 "recording new base alignment for %T\n"
1233 " alignment: %d\n"
1234 " misalignment: %d\n"
1235 " based on: %G",
1236 drb->base_address,
1237 drb->base_alignment,
1238 drb->base_misalignment,
1239 stmt_info->stmt);
1243 /* If the region we're going to vectorize is reached, all unconditional
1244 data references occur at least once. We can therefore pool the base
1245 alignment guarantees from each unconditional reference. Do this by
1246 going through all the data references in VINFO and checking whether
1247 the containing statement makes the reference unconditionally. If so,
1248 record the alignment of the base address in VINFO so that it can be
1249 used for all other references with the same base. */
1251 void
1252 vect_record_base_alignments (vec_info *vinfo)
1254 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1255 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
1256 for (data_reference *dr : vinfo->shared->datarefs)
1258 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1259 stmt_vec_info stmt_info = dr_info->stmt;
1260 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
1261 && STMT_VINFO_VECTORIZABLE (stmt_info)
1262 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1264 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
1266 /* If DR is nested in the loop that is being vectorized, we can also
1267 record the alignment of the base wrt the outer loop. */
1268 if (loop && nested_in_vect_loop_p (loop, stmt_info))
1269 vect_record_base_alignment
1270 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
1275 /* Function vect_compute_data_ref_alignment
1277 Compute the misalignment of the data reference DR_INFO when vectorizing
1278 with VECTYPE.
1280 Output:
1281 1. initialized misalignment info for DR_INFO
1283 FOR NOW: No analysis is actually performed. Misalignment is calculated
1284 only for trivial cases. TODO. */
1286 static void
1287 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1288 tree vectype)
1290 stmt_vec_info stmt_info = dr_info->stmt;
1291 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1292 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1293 class loop *loop = NULL;
1294 tree ref = DR_REF (dr_info->dr);
1296 if (dump_enabled_p ())
1297 dump_printf_loc (MSG_NOTE, vect_location,
1298 "vect_compute_data_ref_alignment:\n");
1300 if (loop_vinfo)
1301 loop = LOOP_VINFO_LOOP (loop_vinfo);
1303 /* Initialize misalignment to unknown. */
1304 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1306 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1307 return;
1309 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1310 bool step_preserves_misalignment_p;
1312 poly_uint64 vector_alignment
1313 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1314 BITS_PER_UNIT);
1315 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1317 /* If the main loop has peeled for alignment we have no way of knowing
1318 whether the data accesses in the epilogues are aligned. We can't at
1319 compile time answer the question whether we have entered the main loop or
1320 not. Fixes PR 92351. */
1321 if (loop_vinfo)
1323 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1324 if (orig_loop_vinfo
1325 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1326 return;
1329 unsigned HOST_WIDE_INT vect_align_c;
1330 if (!vector_alignment.is_constant (&vect_align_c))
1331 return;
1333 /* No step for BB vectorization. */
1334 if (!loop)
1336 gcc_assert (integer_zerop (drb->step));
1337 step_preserves_misalignment_p = true;
1340 /* In case the dataref is in an inner-loop of the loop that is being
1341 vectorized (LOOP), we use the base and misalignment information
1342 relative to the outer-loop (LOOP). This is ok only if the misalignment
1343 stays the same throughout the execution of the inner-loop, which is why
1344 we have to check that the stride of the dataref in the inner-loop evenly
1345 divides by the vector alignment. */
1346 else if (nested_in_vect_loop_p (loop, stmt_info))
1348 step_preserves_misalignment_p
1349 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1351 if (dump_enabled_p ())
1353 if (step_preserves_misalignment_p)
1354 dump_printf_loc (MSG_NOTE, vect_location,
1355 "inner step divides the vector alignment.\n");
1356 else
1357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1358 "inner step doesn't divide the vector"
1359 " alignment.\n");
1363 /* Similarly we can only use base and misalignment information relative to
1364 an innermost loop if the misalignment stays the same throughout the
1365 execution of the loop. As above, this is the case if the stride of
1366 the dataref evenly divides by the alignment. */
1367 else
1369 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1370 step_preserves_misalignment_p
1371 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1373 if (!step_preserves_misalignment_p && dump_enabled_p ())
1374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1375 "step doesn't divide the vector alignment.\n");
1378 unsigned int base_alignment = drb->base_alignment;
1379 unsigned int base_misalignment = drb->base_misalignment;
1381 /* Calculate the maximum of the pooled base address alignment and the
1382 alignment that we can compute for DR itself. */
1383 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1384 = base_alignments->get (drb->base_address);
1385 if (entry
1386 && base_alignment < (*entry).second->base_alignment
1387 && (loop_vinfo
1388 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1389 gimple_bb (entry->first->stmt))
1390 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1391 || (entry->first->dr_aux.group <= dr_info->group)))))
1393 base_alignment = entry->second->base_alignment;
1394 base_misalignment = entry->second->base_misalignment;
1397 if (drb->offset_alignment < vect_align_c
1398 || !step_preserves_misalignment_p
1399 /* We need to know whether the step wrt the vectorized loop is
1400 negative when computing the starting misalignment below. */
1401 || TREE_CODE (drb->step) != INTEGER_CST)
1403 if (dump_enabled_p ())
1404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1405 "Unknown alignment for access: %T\n", ref);
1406 return;
1409 if (base_alignment < vect_align_c)
1411 unsigned int max_alignment;
1412 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1413 if (max_alignment < vect_align_c
1414 || !vect_can_force_dr_alignment_p (base,
1415 vect_align_c * BITS_PER_UNIT))
1417 if (dump_enabled_p ())
1418 dump_printf_loc (MSG_NOTE, vect_location,
1419 "can't force alignment of ref: %T\n", ref);
1420 return;
1423 /* Force the alignment of the decl.
1424 NOTE: This is the only change to the code we make during
1425 the analysis phase, before deciding to vectorize the loop. */
1426 if (dump_enabled_p ())
1427 dump_printf_loc (MSG_NOTE, vect_location,
1428 "force alignment of %T\n", ref);
1430 dr_info->base_decl = base;
1431 dr_info->base_misaligned = true;
1432 base_misalignment = 0;
1434 poly_int64 misalignment
1435 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1437 unsigned int const_misalignment;
1438 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1440 if (dump_enabled_p ())
1441 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1442 "Non-constant misalignment for access: %T\n", ref);
1443 return;
1446 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1448 if (dump_enabled_p ())
1449 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1450 "misalign = %d bytes of ref %T\n",
1451 const_misalignment, ref);
1453 return;
1456 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1457 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1458 is made aligned via peeling. */
1460 static bool
1461 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1462 dr_vec_info *dr_peel_info)
1464 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1465 DR_TARGET_ALIGNMENT (dr_info)))
1467 poly_offset_int diff
1468 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1469 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1470 if (known_eq (diff, 0)
1471 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1472 return true;
1474 return false;
1477 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1478 aligned via peeling. */
1480 static bool
1481 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1482 dr_vec_info *dr_peel_info)
1484 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1485 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1486 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1487 DR_OFFSET (dr_peel_info->dr), 0)
1488 || !operand_equal_p (DR_STEP (dr_info->dr),
1489 DR_STEP (dr_peel_info->dr), 0))
1490 return false;
1492 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1495 /* Compute the value for dr_info->misalign so that the access appears
1496 aligned. This is used by peeling to compensate for dr_misalignment
1497 applying the offset for negative step. */
1500 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1502 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1503 return 0;
1505 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1506 poly_int64 misalignment
1507 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1508 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1510 unsigned HOST_WIDE_INT target_alignment_c;
1511 int misalign;
1512 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1513 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1514 return DR_MISALIGNMENT_UNKNOWN;
1515 return misalign;
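
/* For instance, for a negative-step access with a V4SI vector type
   (4-byte elements) and a 16-byte target alignment this returns
   (4 - 1) * 4 == 12: the access counts as aligned when its misalignment
   cancels the offset that dr_misalignment applies for the negative
   stride.  */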
1518 /* Function vect_update_misalignment_for_peel.
1519 Sets DR_INFO's misalignment
1520 - to 0 if it has the same alignment as DR_PEEL_INFO,
 1521 - to the misalignment computed using NPEEL if DR_INFO's alignment is known,
1522 - to -1 (unknown) otherwise.
1524 DR_INFO - the data reference whose misalignment is to be adjusted.
1525 DR_PEEL_INFO - the data reference whose misalignment is being made
1526 zero in the vector loop by the peel.
1527 NPEEL - the number of iterations in the peel loop if the misalignment
1528 of DR_PEEL_INFO is known at compile time. */
1530 static void
1531 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1532 dr_vec_info *dr_peel_info, int npeel)
 1534 /* If dr_info is aligned whenever dr_peel_info is, then mark it so. */
1535 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1537 SET_DR_MISALIGNMENT (dr_info,
1538 vect_dr_misalign_for_aligned_access (dr_peel_info));
1539 return;
1542 unsigned HOST_WIDE_INT alignment;
1543 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1544 && known_alignment_for_access_p (dr_info,
1545 STMT_VINFO_VECTYPE (dr_info->stmt))
1546 && known_alignment_for_access_p (dr_peel_info,
1547 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1549 int misal = dr_info->misalignment;
1550 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1551 misal &= alignment - 1;
1552 set_dr_misalignment (dr_info, misal);
1553 return;
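 /* Numeric example: with a 16-byte alignment, a 4-byte step, a current
    misalignment of 4 and NPEEL == 3, the new misalignment is
    (4 + 3 * 4) & 15 == 0.  */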
1556 if (dump_enabled_p ())
1557 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1558 "to unknown (-1).\n");
1559 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1562 /* Return true if alignment is relevant for DR_INFO. */
1564 static bool
1565 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1567 stmt_vec_info stmt_info = dr_info->stmt;
1569 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1570 return false;
1572 /* For interleaving, only the alignment of the first access matters. */
1573 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1574 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1575 return false;
1577 /* Scatter-gather and invariant accesses continue to address individual
1578 scalars, so vector-level alignment is irrelevant. */
1579 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1580 || integer_zerop (DR_STEP (dr_info->dr)))
1581 return false;
1583 /* Strided accesses perform only component accesses, alignment is
1584 irrelevant for them. */
1585 if (STMT_VINFO_STRIDED_P (stmt_info)
1586 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1587 return false;
1589 return true;
 1592 /* Given a memory reference EXP, return whether its alignment is less
 1593 than its size. */
1595 static bool
1596 not_size_aligned (tree exp)
1598 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1599 return true;
1601 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1602 > get_object_alignment (exp));
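
/* E.g. a 4-byte int field of a packed struct known only to be 1-byte
   aligned has TYPE_SIZE 32 bits but object alignment 8 bits, so it is
   considered not size-aligned (i.e. packed) here.  */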
1605 /* Function vector_alignment_reachable_p
1607 Return true if vector alignment for DR_INFO is reachable by peeling
1608 a few loop iterations. Return false otherwise. */
1610 static bool
1611 vector_alignment_reachable_p (dr_vec_info *dr_info)
1613 stmt_vec_info stmt_info = dr_info->stmt;
1614 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1616 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 1618 /* For interleaved access we peel only if the number of iterations in
 1619 the prolog loop ({VF - misalignment}) is a multiple of the
 1620 number of interleaved accesses. */
1621 int elem_size, mis_in_elements;
1623 /* FORNOW: handle only known alignment. */
1624 if (!known_alignment_for_access_p (dr_info, vectype))
1625 return false;
1627 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1628 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1629 elem_size = vector_element_size (vector_size, nelements);
1630 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1632 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1633 return false;
1636 /* If misalignment is known at the compile time then allow peeling
1637 only if natural alignment is reachable through peeling. */
1638 if (known_alignment_for_access_p (dr_info, vectype)
1639 && !aligned_access_p (dr_info, vectype))
1641 HOST_WIDE_INT elmsize =
1642 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1643 if (dump_enabled_p ())
1645 dump_printf_loc (MSG_NOTE, vect_location,
1646 "data size = %wd. misalignment = %d.\n", elmsize,
1647 dr_misalignment (dr_info, vectype));
1649 if (dr_misalignment (dr_info, vectype) % elmsize)
1651 if (dump_enabled_p ())
1652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1653 "data size does not divide the misalignment.\n");
1654 return false;
1658 if (!known_alignment_for_access_p (dr_info, vectype))
1660 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1661 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1662 if (dump_enabled_p ())
1663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1664 "Unknown misalignment, %snaturally aligned\n",
1665 is_packed ? "not " : "");
1666 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1669 return true;
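
/* For example, with a 16-byte target alignment, 4-byte elements and a
   unit-stride access, a known misalignment of 8 bytes can be fixed by
   peeling two scalar iterations, whereas a misalignment of 6 bytes is
   rejected above because the element size does not divide it.  */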
1673 /* Calculate the cost of the memory access represented by DR_INFO. */
1675 static void
1676 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1677 dr_alignment_support alignment_support_scheme,
1678 int misalignment,
1679 unsigned int *inside_cost,
1680 unsigned int *outside_cost,
1681 stmt_vector_for_cost *body_cost_vec,
1682 stmt_vector_for_cost *prologue_cost_vec)
1684 stmt_vec_info stmt_info = dr_info->stmt;
1685 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1686 int ncopies;
1688 if (PURE_SLP_STMT (stmt_info))
1689 ncopies = 1;
1690 else
1691 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1693 if (DR_IS_READ (dr_info->dr))
1694 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1695 misalignment, true, inside_cost,
1696 outside_cost, prologue_cost_vec, body_cost_vec, false);
1697 else
1698 vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1699 misalignment, inside_cost, body_cost_vec);
1701 if (dump_enabled_p ())
1702 dump_printf_loc (MSG_NOTE, vect_location,
1703 "vect_get_data_access_cost: inside_cost = %d, "
1704 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1708 typedef struct _vect_peel_info
1710 dr_vec_info *dr_info;
1711 int npeel;
1712 unsigned int count;
1713 } *vect_peel_info;
1715 typedef struct _vect_peel_extended_info
1717 vec_info *vinfo;
1718 struct _vect_peel_info peel_info;
1719 unsigned int inside_cost;
1720 unsigned int outside_cost;
1721 } *vect_peel_extended_info;
1724 /* Peeling hashtable helpers. */
1726 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1728 static inline hashval_t hash (const _vect_peel_info *);
1729 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1732 inline hashval_t
1733 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1735 return (hashval_t) peel_info->npeel;
1738 inline bool
1739 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1741 return (a->npeel == b->npeel);
1745 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1747 static void
1748 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1749 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1750 int npeel, bool supportable_if_not_aligned)
1752 struct _vect_peel_info elem, *slot;
1753 _vect_peel_info **new_slot;
1755 elem.npeel = npeel;
1756 slot = peeling_htab->find (&elem);
1757 if (slot)
1758 slot->count++;
1759 else
1761 slot = XNEW (struct _vect_peel_info);
1762 slot->npeel = npeel;
1763 slot->dr_info = dr_info;
1764 slot->count = 1;
1765 new_slot = peeling_htab->find_slot (slot, INSERT);
1766 *new_slot = slot;
1769 /* If this DR is not supported with unknown misalignment then bias
1770 this slot when the cost model is disabled. */
1771 if (!supportable_if_not_aligned
1772 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1773 slot->count += VECT_MAX_COST;
1777 /* Traverse peeling hash table to find peeling option that aligns maximum
1778 number of data accesses. */
1781 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1782 _vect_peel_extended_info *max)
1784 vect_peel_info elem = *slot;
1786 if (elem->count > max->peel_info.count
1787 || (elem->count == max->peel_info.count
1788 && max->peel_info.npeel > elem->npeel))
1790 max->peel_info.npeel = elem->npeel;
1791 max->peel_info.count = elem->count;
1792 max->peel_info.dr_info = elem->dr_info;
1795 return 1;
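/* For example (the numbers are illustrative): if one data reference
   becomes aligned by peeling 1 or 5 scalar iterations and another by
   peeling 1 or 3, the hash table holds the counts {1: 2, 3: 1, 5: 1}
   and the traversal above selects npeel = 1, the option that aligns
   the most accesses; ties are broken in favour of the smaller npeel.  */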
1798 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1799 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1800 npeel is computed at runtime but DR0_INFO's misalignment will be zero
1801 after peeling. */
1803 static void
1804 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1805 dr_vec_info *dr0_info,
1806 unsigned int *inside_cost,
1807 unsigned int *outside_cost,
1808 stmt_vector_for_cost *body_cost_vec,
1809 stmt_vector_for_cost *prologue_cost_vec,
1810 unsigned int npeel)
1812 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1814 bool dr0_alignment_known_p
1815 = (dr0_info
1816 && known_alignment_for_access_p (dr0_info,
1817 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1819 for (data_reference *dr : datarefs)
1821 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1822 if (!vect_relevant_for_alignment_p (dr_info))
1823 continue;
1825 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1826 dr_alignment_support alignment_support_scheme;
1827 int misalignment;
1828 unsigned HOST_WIDE_INT alignment;
1830 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1831 size_zero_node) < 0;
1832 poly_int64 off = 0;
1833 if (negative)
1834 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1835 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1837 if (npeel == 0)
1838 misalignment = dr_misalignment (dr_info, vectype, off);
1839 else if (dr_info == dr0_info
1840 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1841 misalignment = 0;
1842 else if (!dr0_alignment_known_p
1843 || !known_alignment_for_access_p (dr_info, vectype)
1844 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1845 misalignment = DR_MISALIGNMENT_UNKNOWN;
1846 else
1848 misalignment = dr_misalignment (dr_info, vectype, off);
1849 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1850 misalignment &= alignment - 1;
1852 alignment_support_scheme
1853 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1854 misalignment);
1856 vect_get_data_access_cost (loop_vinfo, dr_info,
1857 alignment_support_scheme, misalignment,
1858 inside_cost, outside_cost,
1859 body_cost_vec, prologue_cost_vec);
1863 /* Traverse peeling hash table and calculate cost for each peeling option.
1864 Find the one with the lowest cost. */
1867 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1868 _vect_peel_extended_info *min)
1870 vect_peel_info elem = *slot;
1871 int dummy;
1872 unsigned int inside_cost = 0, outside_cost = 0;
1873 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1874 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1875 epilogue_cost_vec;
1877 prologue_cost_vec.create (2);
1878 body_cost_vec.create (2);
1879 epilogue_cost_vec.create (2);
1881 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1882 &outside_cost, &body_cost_vec,
1883 &prologue_cost_vec, elem->npeel);
1885 body_cost_vec.release ();
1887 outside_cost += vect_get_known_peeling_cost
1888 (loop_vinfo, elem->npeel, &dummy,
1889 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1890 &prologue_cost_vec, &epilogue_cost_vec);
1892 /* Prologue and epilogue costs are added to the target model later.
1893 These costs depend only on the scalar iteration cost, the
1894 number of peeling iterations finally chosen, and the number of
1895 misaligned statements. So discard the information found here. */
1896 prologue_cost_vec.release ();
1897 epilogue_cost_vec.release ();
1899 if (inside_cost < min->inside_cost
1900 || (inside_cost == min->inside_cost
1901 && outside_cost < min->outside_cost))
1903 min->inside_cost = inside_cost;
1904 min->outside_cost = outside_cost;
1905 min->peel_info.dr_info = elem->dr_info;
1906 min->peel_info.npeel = elem->npeel;
1907 min->peel_info.count = elem->count;
1910 return 1;
1914 /* Choose best peeling option by traversing peeling hash table and either
1915 choosing an option with the lowest cost (if cost model is enabled) or the
1916 option that aligns as many accesses as possible. */
1918 static struct _vect_peel_extended_info
1919 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1920 loop_vec_info loop_vinfo)
1922 struct _vect_peel_extended_info res;
1924 res.peel_info.dr_info = NULL;
1925 res.vinfo = loop_vinfo;
1927 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1929 res.inside_cost = INT_MAX;
1930 res.outside_cost = INT_MAX;
1931 peeling_htab->traverse <_vect_peel_extended_info *,
1932 vect_peeling_hash_get_lowest_cost> (&res);
1934 else
1936 res.peel_info.count = 0;
1937 peeling_htab->traverse <_vect_peel_extended_info *,
1938 vect_peeling_hash_get_most_frequent> (&res);
1939 res.inside_cost = 0;
1940 res.outside_cost = 0;
1943 return res;
1946 /* Return true if the new peeling NPEEL is supported. */
1948 static bool
1949 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1950 unsigned npeel)
1952 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1953 enum dr_alignment_support supportable_dr_alignment;
1955 bool dr0_alignment_known_p
1956 = known_alignment_for_access_p (dr0_info,
1957 STMT_VINFO_VECTYPE (dr0_info->stmt));
1959 /* Ensure that all data refs can be vectorized after the peel. */
1960 for (data_reference *dr : datarefs)
1962 if (dr == dr0_info->dr)
1963 continue;
1965 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1966 if (!vect_relevant_for_alignment_p (dr_info)
1967 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1968 continue;
1970 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1971 int misalignment;
1972 unsigned HOST_WIDE_INT alignment;
1973 if (!dr0_alignment_known_p
1974 || !known_alignment_for_access_p (dr_info, vectype)
1975 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1976 misalignment = DR_MISALIGNMENT_UNKNOWN;
1977 else
1979 misalignment = dr_misalignment (dr_info, vectype);
1980 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1981 misalignment &= alignment - 1;
1983 supportable_dr_alignment
1984 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1985 misalignment);
1986 if (supportable_dr_alignment == dr_unaligned_unsupported)
1987 return false;
1990 return true;
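/* The update of a known misalignment after peeling, used above and in
   vect_get_peeling_costs_all_drs, boils down to the arithmetic below
   (an illustrative sketch only; the names are not part of the
   vectorizer, and TARGET_ALIGN is assumed to be a power of two).  For
   example a DR with byte misalignment 4 and byte step 4 under a
   16-byte target alignment becomes (4 + 3 * 4) & 15 == 0 after
   peeling 3 iterations.  */

static inline int
misalignment_after_peel_sketch (int misalignment, unsigned int npeel,
                                int step, unsigned int target_align)
{
  return (misalignment + (int) npeel * step) & (int) (target_align - 1);
}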
1993 /* Compare two data-references DRA and DRB to group them into chunks
1994 with related alignment. */
1996 static int
1997 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1999 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2000 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2001 int cmp;
2003 /* Stabilize sort. */
2004 if (dra == drb)
2005 return 0;
2007 /* Ordering of DRs according to base. */
2008 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2009 DR_BASE_ADDRESS (drb));
2010 if (cmp != 0)
2011 return cmp;
2013 /* And according to DR_OFFSET. */
2014 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2015 if (cmp != 0)
2016 return cmp;
2018 /* And after step. */
2019 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2020 if (cmp != 0)
2021 return cmp;
2023 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2024 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2025 if (cmp == 0)
2026 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2027 return cmp;
2030 /* Function vect_enhance_data_refs_alignment
2032 This pass will use loop versioning and loop peeling in order to enhance
2033 the alignment of data references in the loop.
2035 FOR NOW: we assume that whatever versioning/peeling takes place, only the
2036 original loop is to be vectorized. Any other loops that are created by
2037 the transformations performed in this pass are not supposed to be
2038 vectorized. This restriction will be relaxed.
2040 This pass will require a cost model to guide it whether to apply peeling
2041 or versioning or a combination of the two. For example, the scheme that
2042 Intel uses when given a loop with several memory accesses is as follows:
2043 choose one memory access ('p') whose alignment you want to force by doing
2044 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
2045 other accesses are not necessarily aligned, or (2) use loop versioning to
2046 generate one loop in which all accesses are aligned, and another loop in
2047 which only 'p' is necessarily aligned.
2049 ("Automatic Intra-Register Vectorization for the Intel Architecture",
2050 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
2051 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
2053 Devising a cost model is the most critical aspect of this work. It will
2054 guide us on which access to peel for, whether to use loop versioning, how
2055 many versions to create, etc. The cost model will probably consist of
2056 generic considerations as well as target specific considerations (on
2057 powerpc for example, misaligned stores are more painful than misaligned
2058 loads).
2060 Here are the general steps involved in alignment enhancements:
2062 -- original loop, before alignment analysis:
2063 for (i=0; i<N; i++){
2064 x = q[i]; # DR_MISALIGNMENT(q) = unknown
2065 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2068 -- After vect_compute_data_refs_alignment:
2069 for (i=0; i<N; i++){
2070 x = q[i]; # DR_MISALIGNMENT(q) = 3
2071 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2074 -- Possibility 1: we do loop versioning:
2075 if (p is aligned) {
2076 for (i=0; i<N; i++){ # loop 1A
2077 x = q[i]; # DR_MISALIGNMENT(q) = 3
2078 p[i] = y; # DR_MISALIGNMENT(p) = 0
2081 else {
2082 for (i=0; i<N; i++){ # loop 1B
2083 x = q[i]; # DR_MISALIGNMENT(q) = 3
2084 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2088 -- Possibility 2: we do loop peeling:
2089 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2090 x = q[i];
2091 p[i] = y;
2093 for (i = 3; i < N; i++){ # loop 2A
2094 x = q[i]; # DR_MISALIGNMENT(q) = 0
2095 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2098 -- Possibility 3: combination of loop peeling and versioning:
2099 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2100 x = q[i];
2101 p[i] = y;
2103 if (p is aligned) {
2104 for (i = 3; i<N; i++){ # loop 3A
2105 x = q[i]; # DR_MISALIGNMENT(q) = 0
2106 p[i] = y; # DR_MISALIGNMENT(p) = 0
2109 else {
2110 for (i = 3; i<N; i++){ # loop 3B
2111 x = q[i]; # DR_MISALIGNMENT(q) = 0
2112 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2116 These loops are later passed to loop_transform to be vectorized. The
2117 vectorizer will use the alignment information to guide the transformation
2118 (whether to generate regular loads/stores, or with special handling for
2119 misalignment). */
2121 opt_result
2122 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
2124 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2125 dr_vec_info *first_store = NULL;
2126 dr_vec_info *dr0_info = NULL;
2127 struct data_reference *dr;
2128 unsigned int i;
2129 bool do_peeling = false;
2130 bool do_versioning = false;
2131 unsigned int npeel = 0;
2132 bool one_misalignment_known = false;
2133 bool one_misalignment_unknown = false;
2134 bool one_dr_unsupportable = false;
2135 dr_vec_info *unsupportable_dr_info = NULL;
2136 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
2137 hash_table<peel_info_hasher> peeling_htab (1);
2139 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
2141 /* Reset data so we can safely be called multiple times. */
2142 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2143 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
2145 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
2146 return opt_result::success ();
2148 /* Sort the vector of datarefs so DRs that have the same or dependent
2149 alignment are next to each other. */
2150 auto_vec<data_reference_p> datarefs
2151 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
2152 datarefs.qsort (dr_align_group_sort_cmp);
2154 /* Compute the number of DRs that become aligned when we peel
2155 a dataref so it becomes aligned. */
2156 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
2157 n_same_align_refs.quick_grow_cleared (datarefs.length ());
2158 unsigned i0;
2159 for (i0 = 0; i0 < datarefs.length (); ++i0)
2160 if (DR_BASE_ADDRESS (datarefs[i0]))
2161 break;
2162 for (i = i0 + 1; i <= datarefs.length (); ++i)
2164 if (i == datarefs.length ()
2165 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
2166 DR_BASE_ADDRESS (datarefs[i]), 0)
2167 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
2168 DR_OFFSET (datarefs[i]), 0)
2169 || !operand_equal_p (DR_STEP (datarefs[i0]),
2170 DR_STEP (datarefs[i]), 0))
2172 /* The subgroup [i0, i-1] now only differs in DR_INIT and
2173 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
2174 will get known misalignment if we align one of the refs
2175 with the largest DR_TARGET_ALIGNMENT. */
2176 for (unsigned j = i0; j < i; ++j)
2178 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
2179 for (unsigned k = i0; k < i; ++k)
2181 if (k == j)
2182 continue;
2183 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
2184 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
2185 dr_infoj))
2186 n_same_align_refs[j]++;
2189 i0 = i;
2193 /* While cost model enhancements are expected in the future, the high level
2194 view of the code at this time is as follows:
2196 A) If there is a misaligned access then see if peeling to align
2197 this access can make all data references satisfy
2198 vect_supportable_dr_alignment. If so, update data structures
2199 as needed and return true.
2201 B) If peeling wasn't possible and there is a data reference with an
2202 unknown misalignment that does not satisfy vect_supportable_dr_alignment
2203 then see if loop versioning checks can be used to make all data
2204 references satisfy vect_supportable_dr_alignment. If so, update
2205 data structures as needed and return true.
2207 C) If neither peeling nor versioning were successful then return false if
2208 any data reference does not satisfy vect_supportable_dr_alignment.
2210 D) Return true (all data references satisfy vect_supportable_dr_alignment).
2212 Note, Possibility 3 above (which is peeling and versioning together) is not
2213 being done at this time. */
2215 /* (1) Peeling to force alignment. */
2217 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
2218 Considerations:
2219 + How many accesses will become aligned due to the peeling
2220 - How many accesses will become unaligned due to the peeling,
2221 and the cost of misaligned accesses.
2222 - The cost of peeling (the extra runtime checks, the increase
2223 in code size). */
2225 FOR_EACH_VEC_ELT (datarefs, i, dr)
2227 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2228 if (!vect_relevant_for_alignment_p (dr_info))
2229 continue;
2231 stmt_vec_info stmt_info = dr_info->stmt;
2232 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2233 do_peeling = vector_alignment_reachable_p (dr_info);
2234 if (do_peeling)
2236 if (known_alignment_for_access_p (dr_info, vectype))
2238 unsigned int npeel_tmp = 0;
2239 bool negative = tree_int_cst_compare (DR_STEP (dr),
2240 size_zero_node) < 0;
2242 /* If known_alignment_for_access_p then we have set
2243 DR_MISALIGNMENT, which is only done if we know it at compile
2244 time, so it is safe to assume the target alignment is constant. */
2246 unsigned int target_align =
2247 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
2248 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
2249 poly_int64 off = 0;
2250 if (negative)
2251 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
2252 unsigned int mis = dr_misalignment (dr_info, vectype, off);
2253 mis = negative ? mis : -mis;
2254 if (mis != 0)
2255 npeel_tmp = (mis & (target_align - 1)) / dr_size;
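/* A concrete instance (illustrative numbers): with target_align 16,
   dr_size 4 and a forward step, a byte misalignment of 4 gives
   mis = -4 above, so npeel_tmp = ((-4) & 15) / 4 = 3; peeling three
   scalar iterations advances the access by 12 bytes onto a 16-byte
   boundary.  */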
2257 /* For multiple types, it is possible that the bigger type access
2258 will have more than one peeling option. E.g., a loop with two
2259 types: one of size (vector size / 4), and the other one of
2260 size (vector size / 8). The vectorization factor will be 8. If both
2261 accesses are misaligned by 3, the first one needs one scalar
2262 iteration to be aligned, and the second one needs 5. But the
2263 first one will be aligned also by peeling 5 scalar
2264 iterations, and in that case both accesses will be aligned.
2265 Hence, except for the immediate peeling amount, we also want
2266 to try to add full vector size, while we don't exceed
2267 vectorization factor.
2268 We do this automatically for cost model, since we calculate
2269 cost for every peeling option. */
2270 poly_uint64 nscalars = npeel_tmp;
2271 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2273 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2274 nscalars = (STMT_SLP_TYPE (stmt_info)
2275 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
2278 /* Save info about DR in the hash table. Also include peeling
2279 amounts according to the explanation above. Indicate
2280 the alignment status when the ref is not aligned.
2281 ??? Rather than using unknown alignment here we should
2282 prune all entries from the peeling hashtable which cause
2283 DRs to be not supported. */
2284 bool supportable_if_not_aligned
2285 = vect_supportable_dr_alignment
2286 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2287 while (known_le (npeel_tmp, nscalars))
2289 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2290 dr_info, npeel_tmp,
2291 supportable_if_not_aligned);
2292 npeel_tmp += MAX (1, target_align / dr_size);
2295 one_misalignment_known = true;
2297 else
2299 /* If we don't know any misalignment values, we prefer
2300 peeling for the data-ref that has the maximum number of data-refs
2301 with the same alignment, unless the target prefers to align
2302 stores over loads. */
2303 unsigned same_align_drs = n_same_align_refs[i];
2304 if (!dr0_info
2305 || dr0_same_align_drs < same_align_drs)
2307 dr0_same_align_drs = same_align_drs;
2308 dr0_info = dr_info;
2310 /* For data-refs with the same number of related
2311 accesses prefer the one where the misalign
2312 computation will be invariant in the outermost loop. */
2313 else if (dr0_same_align_drs == same_align_drs)
2315 class loop *ivloop0, *ivloop;
2316 ivloop0 = outermost_invariant_loop_for_expr
2317 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2318 ivloop = outermost_invariant_loop_for_expr
2319 (loop, DR_BASE_ADDRESS (dr));
2320 if ((ivloop && !ivloop0)
2321 || (ivloop && ivloop0
2322 && flow_loop_nested_p (ivloop, ivloop0)))
2323 dr0_info = dr_info;
2326 one_misalignment_unknown = true;
2328 /* Check for data refs with unsupportable alignment that
2329 can be peeled. */
2330 enum dr_alignment_support supportable_dr_alignment
2331 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2332 DR_MISALIGNMENT_UNKNOWN);
2333 if (supportable_dr_alignment == dr_unaligned_unsupported)
2335 one_dr_unsupportable = true;
2336 unsupportable_dr_info = dr_info;
2339 if (!first_store && DR_IS_WRITE (dr))
2341 first_store = dr_info;
2342 first_store_same_align_drs = same_align_drs;
2346 else
2348 if (!aligned_access_p (dr_info, vectype))
2350 if (dump_enabled_p ())
2351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2352 "vector alignment may not be reachable\n");
2353 break;
2358 /* Check if we can possibly peel the loop. */
2359 if (!vect_can_advance_ivs_p (loop_vinfo)
2360 || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
2361 loop_preheader_edge (loop))
2362 || loop->inner
2363 || LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
2364 do_peeling = false;
2366 struct _vect_peel_extended_info peel_for_known_alignment;
2367 struct _vect_peel_extended_info peel_for_unknown_alignment;
2368 struct _vect_peel_extended_info best_peel;
2370 peel_for_unknown_alignment.inside_cost = INT_MAX;
2371 peel_for_unknown_alignment.outside_cost = INT_MAX;
2372 peel_for_unknown_alignment.peel_info.count = 0;
2374 if (do_peeling
2375 && one_misalignment_unknown)
2377 /* Check if the target requires us to prefer stores over loads, i.e., if
2378 misaligned stores are more expensive than misaligned loads (taking
2379 DRs with the same alignment into account). */
2380 unsigned int load_inside_cost = 0;
2381 unsigned int load_outside_cost = 0;
2382 unsigned int store_inside_cost = 0;
2383 unsigned int store_outside_cost = 0;
2384 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2386 stmt_vector_for_cost dummy;
2387 dummy.create (2);
2388 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2389 &load_inside_cost,
2390 &load_outside_cost,
2391 &dummy, &dummy, estimated_npeels);
2392 dummy.release ();
2394 if (first_store)
2396 dummy.create (2);
2397 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2398 &store_inside_cost,
2399 &store_outside_cost,
2400 &dummy, &dummy,
2401 estimated_npeels);
2402 dummy.release ();
2404 else
2406 store_inside_cost = INT_MAX;
2407 store_outside_cost = INT_MAX;
2410 if (load_inside_cost > store_inside_cost
2411 || (load_inside_cost == store_inside_cost
2412 && load_outside_cost > store_outside_cost))
2414 dr0_info = first_store;
2415 dr0_same_align_drs = first_store_same_align_drs;
2416 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2417 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2419 else
2421 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2422 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2425 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2426 prologue_cost_vec.create (2);
2427 epilogue_cost_vec.create (2);
2429 int dummy2;
2430 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2431 (loop_vinfo, estimated_npeels, &dummy2,
2432 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2433 &prologue_cost_vec, &epilogue_cost_vec);
2435 prologue_cost_vec.release ();
2436 epilogue_cost_vec.release ();
2438 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2441 peel_for_unknown_alignment.peel_info.npeel = 0;
2442 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2444 best_peel = peel_for_unknown_alignment;
2446 peel_for_known_alignment.inside_cost = INT_MAX;
2447 peel_for_known_alignment.outside_cost = INT_MAX;
2448 peel_for_known_alignment.peel_info.count = 0;
2449 peel_for_known_alignment.peel_info.dr_info = NULL;
2451 if (do_peeling && one_misalignment_known)
2453 /* Peeling is possible, but there is no data access that is not supported
2454 unless aligned. So we try to choose the best possible peeling from
2455 the hash table. */
2456 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2457 (&peeling_htab, loop_vinfo);
2460 /* Compare costs of peeling for known and unknown alignment. */
2461 if (peel_for_known_alignment.peel_info.dr_info != NULL
2462 && peel_for_unknown_alignment.inside_cost
2463 >= peel_for_known_alignment.inside_cost)
2465 best_peel = peel_for_known_alignment;
2467 /* If the best peeling for known alignment has NPEEL == 0, perform no
2468 peeling at all except if there is an unsupportable dr that we can
2469 align. */
2470 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2471 do_peeling = false;
2474 /* If there is an unsupportable data ref, prefer this over all choices so far
2475 since we'd have to discard a chosen peeling except when it accidentally
2476 aligned the unsupportable data ref. */
2477 if (one_dr_unsupportable)
2478 dr0_info = unsupportable_dr_info;
2479 else if (do_peeling)
2481 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2482 TODO: Use nopeel_outside_cost or get rid of it? */
2483 unsigned nopeel_inside_cost = 0;
2484 unsigned nopeel_outside_cost = 0;
2486 stmt_vector_for_cost dummy;
2487 dummy.create (2);
2488 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2489 &nopeel_outside_cost, &dummy, &dummy, 0);
2490 dummy.release ();
2492 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2493 costs will be recorded. */
2494 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2495 prologue_cost_vec.create (2);
2496 epilogue_cost_vec.create (2);
2498 int dummy2;
2499 nopeel_outside_cost += vect_get_known_peeling_cost
2500 (loop_vinfo, 0, &dummy2,
2501 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2502 &prologue_cost_vec, &epilogue_cost_vec);
2504 prologue_cost_vec.release ();
2505 epilogue_cost_vec.release ();
2507 npeel = best_peel.peel_info.npeel;
2508 dr0_info = best_peel.peel_info.dr_info;
2510 /* If not peeling at all is no more expensive than the best peeling we
2511 have found so far, don't perform any peeling. */
2512 if (nopeel_inside_cost <= best_peel.inside_cost)
2513 do_peeling = false;
2516 if (do_peeling)
2518 stmt_vec_info stmt_info = dr0_info->stmt;
2519 if (known_alignment_for_access_p (dr0_info,
2520 STMT_VINFO_VECTYPE (stmt_info)))
2522 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2523 size_zero_node) < 0;
2524 if (!npeel)
2526 /* Since it's known at compile time, compute the number of
2527 iterations in the peeled loop (the peeling factor) for use in
2528 updating DR_MISALIGNMENT values. The peeling factor is the
2529 vectorization factor minus the misalignment as an element
2530 count. */
2531 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2532 poly_int64 off = 0;
2533 if (negative)
2534 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2535 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2536 unsigned int mis
2537 = dr_misalignment (dr0_info, vectype, off);
2538 mis = negative ? mis : -mis;
2539 /* If known_alignment_for_access_p then we have set
2540 DR_MISALIGNMENT, which is only done if we know it at compile
2541 time, so it is safe to assume the target alignment is constant. */
2543 unsigned int target_align =
2544 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2545 npeel = ((mis & (target_align - 1))
2546 / vect_get_scalar_dr_size (dr0_info));
2549 /* For interleaved data access every iteration accesses all the
2550 members of the group, therefore we divide the number of iterations
2551 by the group size. */
2552 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2553 npeel /= DR_GROUP_SIZE (stmt_info);
2555 if (dump_enabled_p ())
2556 dump_printf_loc (MSG_NOTE, vect_location,
2557 "Try peeling by %d\n", npeel);
2560 /* Ensure that all datarefs can be vectorized after the peel. */
2561 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2562 do_peeling = false;
2564 /* Check if all datarefs are supportable and log. */
2565 if (do_peeling
2566 && npeel == 0
2567 && known_alignment_for_access_p (dr0_info,
2568 STMT_VINFO_VECTYPE (stmt_info)))
2569 return opt_result::success ();
2571 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2572 if (do_peeling)
2574 unsigned max_allowed_peel
2575 = param_vect_max_peeling_for_alignment;
2576 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2577 max_allowed_peel = 0;
2578 if (max_allowed_peel != (unsigned)-1)
2580 unsigned max_peel = npeel;
2581 if (max_peel == 0)
2583 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2584 unsigned HOST_WIDE_INT target_align_c;
2585 if (target_align.is_constant (&target_align_c))
2586 max_peel =
2587 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2588 else
2590 do_peeling = false;
2591 if (dump_enabled_p ())
2592 dump_printf_loc (MSG_NOTE, vect_location,
2593 "Disable peeling, max peels set and vector"
2594 " alignment unknown\n");
2597 if (max_peel > max_allowed_peel)
2599 do_peeling = false;
2600 if (dump_enabled_p ())
2601 dump_printf_loc (MSG_NOTE, vect_location,
2602 "Disable peeling, max peels reached: %d\n", max_peel);
2607 /* Cost model #2 - if peeling may result in a remaining loop not
2608 iterating enough to be vectorized then do not peel. Since this
2609 is a cost heuristic rather than a correctness decision, use the
2610 most likely runtime value for variable vectorization factors. */
2611 if (do_peeling
2612 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2614 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2615 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2616 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2617 < assumed_vf + max_peel)
2618 do_peeling = false;
2621 if (do_peeling)
2623 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2624 If the misalignment of DR_i is identical to that of dr0 then set
2625 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2626 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2627 by the peeling factor times the element size of DR_i (MOD the
2628 vectorization factor times the size). Otherwise, the
2629 misalignment of DR_i must be set to unknown. */
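/* Worked example (illustrative numbers): if dr0 has byte misalignment
   12 with element size 4 and a 16-byte target alignment, npeel is 1;
   a DR_i with the same step and known misalignment 8 then becomes
   8 + 1 * 4 = 12 (reduced modulo 16), while a DR_i whose misalignment
   was unknown simply stays unknown.  */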
2630 FOR_EACH_VEC_ELT (datarefs, i, dr)
2631 if (dr != dr0_info->dr)
2633 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2634 if (!vect_relevant_for_alignment_p (dr_info))
2635 continue;
2637 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2640 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2641 if (npeel)
2642 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2643 else
2644 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2645 SET_DR_MISALIGNMENT (dr0_info,
2646 vect_dr_misalign_for_aligned_access (dr0_info));
2647 if (dump_enabled_p ())
2649 dump_printf_loc (MSG_NOTE, vect_location,
2650 "Alignment of access forced using peeling.\n");
2651 dump_printf_loc (MSG_NOTE, vect_location,
2652 "Peeling for alignment will be applied.\n");
2655 /* The inside-loop cost will be accounted for in vectorizable_load
2656 and vectorizable_store correctly with adjusted alignments.
2657 Drop the body_cst_vec on the floor here. */
2658 return opt_result::success ();
2662 /* (2) Versioning to force alignment. */
2664 /* Try versioning if:
2665 1) optimize loop for speed and the cost-model is not cheap
2666 2) there is at least one unsupported misaligned data ref with an unknown
2667 misalignment, and
2668 3) all misaligned data refs with a known misalignment are supported, and
2669 4) the number of runtime alignment checks is within reason. */
2671 do_versioning
2672 = (optimize_loop_nest_for_speed_p (loop)
2673 && !loop->inner /* FORNOW */
2674 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2676 if (do_versioning)
2678 FOR_EACH_VEC_ELT (datarefs, i, dr)
2680 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2681 if (!vect_relevant_for_alignment_p (dr_info))
2682 continue;
2684 stmt_vec_info stmt_info = dr_info->stmt;
2685 if (STMT_VINFO_STRIDED_P (stmt_info))
2687 do_versioning = false;
2688 break;
2691 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2692 bool negative = tree_int_cst_compare (DR_STEP (dr),
2693 size_zero_node) < 0;
2694 poly_int64 off = 0;
2695 if (negative)
2696 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2697 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2698 int misalignment;
2699 if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2700 continue;
2702 enum dr_alignment_support supportable_dr_alignment
2703 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2704 misalignment);
2705 if (supportable_dr_alignment == dr_unaligned_unsupported)
2707 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2708 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2709 >= (unsigned) param_vect_max_version_for_alignment_checks))
2711 do_versioning = false;
2712 break;
2715 /* At present we don't support versioning for alignment
2716 with variable VF, since there's no guarantee that the
2717 VF is a power of two. We could relax this if we added
2718 a way of enforcing a power-of-two size. */
2719 unsigned HOST_WIDE_INT size;
2720 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2722 do_versioning = false;
2723 break;
2726 /* Forcing alignment in the first iteration is no good if
2727 we don't keep it across iterations. For now, just disable
2728 versioning in this case.
2729 ?? We could actually unroll the loop to achieve the required
2730 overall step alignment, and forcing the alignment could be
2731 done by doing some iterations of the non-vectorized loop. */
2732 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2733 * DR_STEP_ALIGNMENT (dr),
2734 DR_TARGET_ALIGNMENT (dr_info)))
2736 do_versioning = false;
2737 break;
2740 /* The rightmost bits of an aligned address must be zeros.
2741 Construct the mask needed for this test. For example,
2742 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2743 mask must be 15 = 0xf. */
2744 int mask = size - 1;
2746 /* FORNOW: use the same mask to test all potentially unaligned
2747 references in the loop. */
2748 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2749 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2751 do_versioning = false;
2752 break;
2755 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2756 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
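/* The statements recorded here later give rise to a runtime test of
   roughly the following shape (see vect_create_cond_for_align_checks),
   shown for two potentially misaligned pointers p and q and the V4SI
   mask 0xf from the example above:

     if ((((uintptr_t) p | (uintptr_t) q) & 0xf) == 0)
       ... vectorized loop with all accesses assumed aligned ...
     else
       ... unvectorized fallback loop ...  */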
2760 /* Versioning requires at least one misaligned data reference. */
2761 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2762 do_versioning = false;
2763 else if (!do_versioning)
2764 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2767 if (do_versioning)
2769 const vec<stmt_vec_info> &may_misalign_stmts
2770 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2771 stmt_vec_info stmt_info;
2773 /* It can now be assumed that the data references in the statements
2774 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2775 of the loop being vectorized. */
2776 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2778 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2779 SET_DR_MISALIGNMENT (dr_info,
2780 vect_dr_misalign_for_aligned_access (dr_info));
2781 if (dump_enabled_p ())
2782 dump_printf_loc (MSG_NOTE, vect_location,
2783 "Alignment of access forced using versioning.\n");
2786 if (dump_enabled_p ())
2787 dump_printf_loc (MSG_NOTE, vect_location,
2788 "Versioning for alignment will be applied.\n");
2790 /* Peeling and versioning can't be done together at this time. */
2791 gcc_assert (! (do_peeling && do_versioning));
2793 return opt_result::success ();
2796 /* This point is reached if neither peeling nor versioning is being done. */
2797 gcc_assert (! (do_peeling || do_versioning));
2799 return opt_result::success ();
2803 /* Function vect_analyze_data_refs_alignment
2805 Analyze the alignment of the data-references in the loop.
2806 Return FALSE if a data reference is found that cannot be vectorized. */
2808 opt_result
2809 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2811 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2813 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2814 struct data_reference *dr;
2815 unsigned int i;
2817 vect_record_base_alignments (loop_vinfo);
2818 FOR_EACH_VEC_ELT (datarefs, i, dr)
2820 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2821 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2823 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2824 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2825 continue;
2826 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2827 STMT_VINFO_VECTYPE (dr_info->stmt));
2831 return opt_result::success ();
2835 /* Analyze alignment of DRs of stmts in NODE. */
2837 static bool
2838 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2840 /* Alignment is maintained in the first element of the group. */
2841 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2842 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2843 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2844 tree vectype = SLP_TREE_VECTYPE (node);
2845 poly_uint64 vector_alignment
2846 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2847 BITS_PER_UNIT);
2848 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2849 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2850 /* Re-analyze alignment when we're facing a vectorization with a bigger
2851 alignment requirement. */
2852 else if (known_lt (dr_info->target_alignment, vector_alignment))
2854 poly_uint64 old_target_alignment = dr_info->target_alignment;
2855 int old_misalignment = dr_info->misalignment;
2856 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2857 /* But keep knowledge about a smaller alignment. */
2858 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2859 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2861 dr_info->target_alignment = old_target_alignment;
2862 dr_info->misalignment = old_misalignment;
2865 /* If we ever face unordered target alignments, the first one wins in terms
2866 of analysis and the other will become unknown in dr_misalignment. */
2867 return true;
2870 /* Function vect_slp_analyze_instance_alignment
2872 Analyze the alignment of the data-references in the SLP instance.
2873 Return FALSE if a data reference is found that cannot be vectorized. */
2875 bool
2876 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2877 slp_instance instance)
2879 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2881 slp_tree node;
2882 unsigned i;
2883 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2884 if (! vect_slp_analyze_node_alignment (vinfo, node))
2885 return false;
2887 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2888 && ! vect_slp_analyze_node_alignment
2889 (vinfo, SLP_INSTANCE_TREE (instance)))
2890 return false;
2892 return true;
2896 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2897 accesses of legal size, step, etc. Detect gaps, single element
2898 interleaving, and other special cases. Set grouped access info.
2899 Collect groups of strided stores for further use in SLP analysis.
2900 Worker for vect_analyze_group_access. */
2902 static bool
2903 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2905 data_reference *dr = dr_info->dr;
2906 tree step = DR_STEP (dr);
2907 tree scalar_type = TREE_TYPE (DR_REF (dr));
2908 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2909 stmt_vec_info stmt_info = dr_info->stmt;
2910 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2911 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2912 HOST_WIDE_INT dr_step = -1;
2913 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2914 bool slp_impossible = false;
2916 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2917 size of the interleaving group (including gaps). */
2918 if (tree_fits_shwi_p (step))
2920 dr_step = tree_to_shwi (step);
2921 /* Check that STEP is a multiple of type size. Otherwise there is
2922 a non-element-sized gap at the end of the group which we
2923 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2924 ??? As we can handle non-constant step fine here we should
2925 simply remove uses of DR_GROUP_GAP between the last and first
2926 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2927 simply not include that gap. */
2928 if ((dr_step % type_size) != 0)
2930 if (dump_enabled_p ())
2931 dump_printf_loc (MSG_NOTE, vect_location,
2932 "Step %T is not a multiple of the element size"
2933 " for %T\n",
2934 step, DR_REF (dr));
2935 return false;
2937 groupsize = absu_hwi (dr_step) / type_size;
2939 else
2940 groupsize = 0;
2942 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2943 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2945 /* Check if this DR is a part of interleaving, and is a single
2946 element of the group that is accessed in the loop. */
2948 /* Gaps are supported only for loads. STEP must be a multiple of the type
2949 size. */
2950 if (DR_IS_READ (dr)
2951 && (dr_step % type_size) == 0
2952 && groupsize > 0
2953 /* This could be UINT_MAX but as we are generating code in a very
2954 inefficient way we have to cap earlier.
2955 See PR91403 for example. */
2956 && groupsize <= 4096)
2958 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2959 DR_GROUP_SIZE (stmt_info) = groupsize;
2960 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2961 if (dump_enabled_p ())
2962 dump_printf_loc (MSG_NOTE, vect_location,
2963 "Detected single element interleaving %T"
2964 " step %T\n",
2965 DR_REF (dr), step);
2967 return true;
2970 if (dump_enabled_p ())
2971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2972 "not consecutive access %G", stmt_info->stmt);
2974 if (bb_vinfo)
2976 /* Mark the statement as unvectorizable. */
2977 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2978 return true;
2981 if (dump_enabled_p ())
2982 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2983 STMT_VINFO_STRIDED_P (stmt_info) = true;
2984 return true;
2987 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2989 /* First stmt in the interleaving chain. Check the chain. */
2990 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2991 struct data_reference *data_ref = dr;
2992 unsigned int count = 1;
2993 tree prev_init = DR_INIT (data_ref);
2994 HOST_WIDE_INT diff, gaps = 0;
2996 /* By construction, all group members have INTEGER_CST DR_INITs. */
2997 while (next)
2999 /* We never have the same DR multiple times. */
3000 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
3001 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
3003 data_ref = STMT_VINFO_DATA_REF (next);
3005 /* All group members have the same STEP by construction. */
3006 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
3008 /* Check that the distance between two accesses is equal to the type
3009 size. Otherwise, we have gaps. */
3010 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
3011 - TREE_INT_CST_LOW (prev_init)) / type_size;
3012 if (diff < 1 || diff > UINT_MAX)
3014 /* For artificial testcases with array accesses with large
3015 constant indices we can run into overflow issues which
3016 can end up fooling the groupsize constraint below so
3017 check the individual gaps (which are represented as
3018 unsigned int) as well. */
3019 if (dump_enabled_p ())
3020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3021 "interleaved access with gap larger "
3022 "than representable\n");
3023 return false;
3025 if (diff != 1)
3027 /* FORNOW: SLP of accesses with gaps is not supported. */
3028 slp_impossible = true;
3029 if (DR_IS_WRITE (data_ref))
3031 if (dump_enabled_p ())
3032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3033 "interleaved store with gaps\n");
3034 return false;
3037 gaps += diff - 1;
3040 last_accessed_element += diff;
3042 /* Store the gap from the previous member of the group. If there is no
3043 gap in the access, DR_GROUP_GAP is always 1. */
3044 DR_GROUP_GAP (next) = diff;
3046 prev_init = DR_INIT (data_ref);
3047 next = DR_GROUP_NEXT_ELEMENT (next);
3048 /* Count the number of data-refs in the chain. */
3049 count++;
3052 if (groupsize == 0)
3053 groupsize = count + gaps;
3055 /* This could be UINT_MAX but as we are generating code in a very
3056 inefficient way we have to cap earlier. See PR78699 for example. */
3057 if (groupsize > 4096)
3059 if (dump_enabled_p ())
3060 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3061 "group is too large\n");
3062 return false;
3065 /* Check that the size of the interleaving is equal to count for stores,
3066 i.e., that there are no gaps. */
3067 if (groupsize != count
3068 && !DR_IS_READ (dr))
3070 groupsize = count;
3071 STMT_VINFO_STRIDED_P (stmt_info) = true;
3074 /* If there is a gap after the last load in the group it is the
3075 difference between the groupsize and the last accessed
3076 element.
3077 When there is no gap, this difference should be 0. */
3078 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
3080 DR_GROUP_SIZE (stmt_info) = groupsize;
3081 if (dump_enabled_p ())
3083 dump_printf_loc (MSG_NOTE, vect_location,
3084 "Detected interleaving ");
3085 if (DR_IS_READ (dr))
3086 dump_printf (MSG_NOTE, "load ");
3087 else if (STMT_VINFO_STRIDED_P (stmt_info))
3088 dump_printf (MSG_NOTE, "strided store ");
3089 else
3090 dump_printf (MSG_NOTE, "store ");
3091 dump_printf (MSG_NOTE, "of size %u\n",
3092 (unsigned)groupsize);
3093 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
3094 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3095 while (next)
3097 if (DR_GROUP_GAP (next) != 1)
3098 dump_printf_loc (MSG_NOTE, vect_location,
3099 "\t<gap of %d elements>\n",
3100 DR_GROUP_GAP (next) - 1);
3101 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
3102 next = DR_GROUP_NEXT_ELEMENT (next);
3104 if (DR_GROUP_GAP (stmt_info) != 0)
3105 dump_printf_loc (MSG_NOTE, vect_location,
3106 "\t<gap of %d elements>\n",
3107 DR_GROUP_GAP (stmt_info));
3110 /* SLP: create an SLP data structure for every interleaving group of
3111 stores for further analysis in vect_analyze_slp. */
3112 if (DR_IS_WRITE (dr) && !slp_impossible)
3114 if (loop_vinfo)
3115 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
3116 if (bb_vinfo)
3117 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3121 return true;
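/* For example (illustrative only), in

     for (i = 0; i < n; i++)
       {
         sum += a[3*i];
         sum += a[3*i + 2];
       }

   both loads have step 3 * sizeof (*a), so GROUPSIZE is 3 while only
   two elements are accessed: the second load gets DR_GROUP_GAP 2,
   i.e. a gap of one element (a[3*i + 1] is never read), and the first
   statement gets DR_GROUP_SIZE 3 with DR_GROUP_GAP 0 since the last
   element of the group is accessed.  The same pattern written as
   stores would be rejected above as an interleaved store with gaps.  */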
3124 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3125 accesses of legal size, step, etc. Detect gaps, single element
3126 interleaving, and other special cases. Set grouped access info.
3127 Collect groups of strided stores for further use in SLP analysis. */
3129 static bool
3130 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
3132 if (!vect_analyze_group_access_1 (vinfo, dr_info))
3134 /* Dissolve the group if present. */
3135 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
3136 while (stmt_info)
3138 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3139 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3140 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3141 stmt_info = next;
3143 return false;
3145 return true;
3148 /* Analyze the access pattern of the data-reference DR_INFO.
3149 In case of non-consecutive accesses call vect_analyze_group_access() to
3150 analyze groups of accesses. */
3152 static bool
3153 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
3155 data_reference *dr = dr_info->dr;
3156 tree step = DR_STEP (dr);
3157 tree scalar_type = TREE_TYPE (DR_REF (dr));
3158 stmt_vec_info stmt_info = dr_info->stmt;
3159 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3160 class loop *loop = NULL;
3162 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
3163 return true;
3165 if (loop_vinfo)
3166 loop = LOOP_VINFO_LOOP (loop_vinfo);
3168 if (loop_vinfo && !step)
3170 if (dump_enabled_p ())
3171 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3172 "bad data-ref access in loop\n");
3173 return false;
3176 /* Allow loads with zero step in inner-loop vectorization. */
3177 if (loop_vinfo && integer_zerop (step))
3179 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3180 if (!nested_in_vect_loop_p (loop, stmt_info))
3181 return DR_IS_READ (dr);
3182 /* Allow references with zero step for outer loops marked
3183 with pragma omp simd only - it guarantees absence of
3184 loop-carried dependencies between inner loop iterations. */
3185 if (loop->safelen < 2)
3187 if (dump_enabled_p ())
3188 dump_printf_loc (MSG_NOTE, vect_location,
3189 "zero step in inner loop of nest\n");
3190 return false;
3194 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3196 /* Interleaved accesses are not yet supported within outer-loop
3197 vectorization for references in the inner-loop. */
3198 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3200 /* For the rest of the analysis we use the outer-loop step. */
3201 step = STMT_VINFO_DR_STEP (stmt_info);
3202 if (integer_zerop (step))
3204 if (dump_enabled_p ())
3205 dump_printf_loc (MSG_NOTE, vect_location,
3206 "zero step in outer loop.\n");
3207 return DR_IS_READ (dr);
3211 /* Consecutive? */
3212 if (TREE_CODE (step) == INTEGER_CST)
3214 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
3215 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
3216 || (dr_step < 0
3217 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
3219 /* Mark that it is not interleaving. */
3220 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3221 return true;
3225 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3227 if (dump_enabled_p ())
3228 dump_printf_loc (MSG_NOTE, vect_location,
3229 "grouped access in outer loop.\n");
3230 return false;
3234 /* Assume this is a DR handled by the non-constant strided load case. */
3235 if (TREE_CODE (step) != INTEGER_CST)
3236 return (STMT_VINFO_STRIDED_P (stmt_info)
3237 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3238 || vect_analyze_group_access (vinfo, dr_info)));
3240 /* Not a consecutive access - check if it's part of an interleaving group. */
3241 return vect_analyze_group_access (vinfo, dr_info);
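/* A few illustrative cases for the classification above, assuming a
   loop over i and plain int arrays:

     a[i] = b[i];            step == element size: consecutive access.
     s += c[0];              zero step: invariant load, allowed for reads.
     d[i * stride] = 0;      non-constant step: only handled when the
                             access was marked STMT_VINFO_STRIDED_P.
     e[2*i] = e[2*i+1] = 1;  constant step of twice the element size:
                             analyzed as an interleaved group of size 2.  */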
3244 /* Compare two data-references DRA and DRB to group them into chunks
3245 suitable for building interleaving groups. */
3247 static int
3248 dr_group_sort_cmp (const void *dra_, const void *drb_)
3250 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
3251 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
3252 data_reference_p dra = dra_info->dr;
3253 data_reference_p drb = drb_info->dr;
3254 int cmp;
3256 /* Stabilize sort. */
3257 if (dra == drb)
3258 return 0;
3260 /* DRs with different group IDs never belong to the same group. */
3261 if (dra_info->group != drb_info->group)
3262 return dra_info->group < drb_info->group ? -1 : 1;
3264 /* Ordering of DRs according to base. */
3265 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3266 DR_BASE_ADDRESS (drb));
3267 if (cmp != 0)
3268 return cmp;
3270 /* And according to DR_OFFSET. */
3271 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
3272 if (cmp != 0)
3273 return cmp;
3275 /* Put reads before writes. */
3276 if (DR_IS_READ (dra) != DR_IS_READ (drb))
3277 return DR_IS_READ (dra) ? -1 : 1;
3279 /* Then sort after access size. */
3280 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3281 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3282 if (cmp != 0)
3283 return cmp;
3285 /* And after step. */
3286 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3287 if (cmp != 0)
3288 return cmp;
3290 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
3291 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3292 if (cmp == 0)
3293 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3294 return cmp;
3297 /* If OP is the result of a conversion, return the unconverted value,
3298 otherwise return null. */
3300 static tree
3301 strip_conversion (tree op)
3303 if (TREE_CODE (op) != SSA_NAME)
3304 return NULL_TREE;
3305 gimple *stmt = SSA_NAME_DEF_STMT (op);
3306 if (!is_gimple_assign (stmt)
3307 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3308 return NULL_TREE;
3309 return gimple_assign_rhs1 (stmt);
3312 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3313 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3314 be grouped in SLP mode. */
3316 static bool
3317 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3318 bool allow_slp_p)
3320 if (gimple_assign_single_p (stmt1_info->stmt))
3321 return gimple_assign_single_p (stmt2_info->stmt);
3323 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3324 if (call1 && gimple_call_internal_p (call1))
3326 /* Check for two masked loads or two masked stores. */
3327 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3328 if (!call2 || !gimple_call_internal_p (call2))
3329 return false;
3330 internal_fn ifn = gimple_call_internal_fn (call1);
3331 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3332 return false;
3333 if (ifn != gimple_call_internal_fn (call2))
3334 return false;
3336 /* Check that the masks are the same. Cope with casts of masks,
3337 like those created by build_mask_conversion. */
3338 tree mask1 = gimple_call_arg (call1, 2);
3339 tree mask2 = gimple_call_arg (call2, 2);
3340 if (!operand_equal_p (mask1, mask2, 0) && !allow_slp_p)
3342 mask1 = strip_conversion (mask1);
3343 if (!mask1)
3344 return false;
3345 mask2 = strip_conversion (mask2);
3346 if (!mask2)
3347 return false;
3348 if (!operand_equal_p (mask1, mask2, 0))
3349 return false;
3351 return true;
3354 return false;
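/* For instance (a sketch in GIMPLE-like notation, details elided):

     _1 = .MASK_LOAD (p_4, 32B, mask_7);
     _2 = .MASK_LOAD (p_5, 32B, mask_8);

   can be grouped if mask_7 and mask_8 are operand_equal_p, if they
   only differ by conversions of the same mask, or, when ALLOW_SLP_P,
   even when the masks differ; a masked access never groups with a
   plain load or store.  */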
3357 /* Function vect_analyze_data_ref_accesses.
3359 Analyze the access pattern of all the data references in the loop.
3361 FORNOW: the only access pattern that is considered vectorizable is a
3362 simple step 1 (consecutive) access.
3364 FORNOW: handle only arrays and pointer accesses. */
3366 opt_result
3367 vect_analyze_data_ref_accesses (vec_info *vinfo,
3368 vec<int> *dataref_groups)
3370 unsigned int i;
3371 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3373 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3375 if (datarefs.is_empty ())
3376 return opt_result::success ();
3378 /* Sort the array of datarefs to make building the interleaving chains
3379 linear. Don't modify the original vector's order, it is needed for
3380 determining what dependencies are reversed. */
3381 vec<dr_vec_info *> datarefs_copy;
3382 datarefs_copy.create (datarefs.length ());
3383 for (unsigned i = 0; i < datarefs.length (); i++)
3385 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3386 /* If the caller computed DR grouping use that, otherwise group by
3387 basic blocks. */
3388 if (dataref_groups)
3389 dr_info->group = (*dataref_groups)[i];
3390 else
3391 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3392 datarefs_copy.quick_push (dr_info);
3394 datarefs_copy.qsort (dr_group_sort_cmp);
3395 hash_set<stmt_vec_info> to_fixup;
3397 /* Build the interleaving chains. */
3398 for (i = 0; i < datarefs_copy.length () - 1;)
3400 dr_vec_info *dr_info_a = datarefs_copy[i];
3401 data_reference_p dra = dr_info_a->dr;
3402 int dra_group_id = dr_info_a->group;
3403 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3404 stmt_vec_info lastinfo = NULL;
3405 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3406 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3408 ++i;
3409 continue;
3411 for (i = i + 1; i < datarefs_copy.length (); ++i)
3413 dr_vec_info *dr_info_b = datarefs_copy[i];
3414 data_reference_p drb = dr_info_b->dr;
3415 int drb_group_id = dr_info_b->group;
3416 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3417 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3418 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3419 break;
3421 /* ??? Imperfect sorting (non-compatible types, non-modulo
3422 accesses, same accesses) can lead to a group being artificially
3423 split here as we don't just skip over those. If it really
3424 matters we can push those to a worklist and re-iterate
3425 over them. Then we can just skip ahead to the next DR here. */
3427 /* DRs in a different DR group should not be put into the same
3428 interleaving group. */
3429 if (dra_group_id != drb_group_id)
3430 break;
3432 /* Check that the data-refs have the same first location (except init)
3433 and that they are both either stores or loads (not a load and a
3434 store, and not masked loads or stores). */
3435 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3436 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3437 DR_BASE_ADDRESS (drb)) != 0
3438 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3439 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3440 break;
3442 /* Check that the data-refs have the same constant size. */
3443 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3444 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3445 if (!tree_fits_uhwi_p (sza)
3446 || !tree_fits_uhwi_p (szb)
3447 || !tree_int_cst_equal (sza, szb))
3448 break;
3450 /* Check that the data-refs have the same step. */
3451 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3452 break;
3454 /* Check the types are compatible.
3455 ??? We don't distinguish this during sorting. */
3456 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3457 TREE_TYPE (DR_REF (drb))))
3458 break;
3460 /* Check that the DR_INITs are compile-time constants. */
3461 if (!tree_fits_shwi_p (DR_INIT (dra))
3462 || !tree_fits_shwi_p (DR_INIT (drb)))
3463 break;
3465 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3466 just hold extra information. */
3467 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3468 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3469 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3470 break;
3472 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3473 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3474 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3475 HOST_WIDE_INT init_prev
3476 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3477 gcc_assert (init_a <= init_b
3478 && init_a <= init_prev
3479 && init_prev <= init_b);
3481 /* Do not place the same access in the interleaving chain twice. */
3482 if (init_b == init_prev)
3484 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3485 < gimple_uid (DR_STMT (drb)));
3486 /* Simply link in duplicates and fix up the chain below. */
3488 else
3490 /* If init_b == init_a + the size of the type * k, we have an
3491 interleaving, and DRA is accessed before DRB. */
3492 unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3493 if (type_size_a == 0
3494 || (((unsigned HOST_WIDE_INT)init_b - init_a)
3495 % type_size_a != 0))
3496 break;
3498 /* If we have a store, the accesses are adjacent. This splits
3499 groups into chunks we support (we don't support vectorization
3500 of stores with gaps). */
3501 if (!DR_IS_READ (dra)
3502 && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3503 != type_size_a))
3504 break;
3506 /* If the step (when it is nonzero and constant) is smaller than the
3507 difference between the data-refs' inits, this splits groups into
3508 suitable sizes. */
3509 if (tree_fits_shwi_p (DR_STEP (dra)))
3511 unsigned HOST_WIDE_INT step
3512 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3513 if (step != 0
3514 && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3515 break;
3519 if (dump_enabled_p ())
3520 dump_printf_loc (MSG_NOTE, vect_location,
3521 DR_IS_READ (dra)
3522 ? "Detected interleaving load %T and %T\n"
3523 : "Detected interleaving store %T and %T\n",
3524 DR_REF (dra), DR_REF (drb));
3526 /* Link the found element into the group list. */
3527 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3529 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3530 lastinfo = stmtinfo_a;
3532 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3533 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3534 lastinfo = stmtinfo_b;
3536 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3537 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3539 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3540 dump_printf_loc (MSG_NOTE, vect_location,
3541 "Load suitable for SLP vectorization only.\n");
3543 if (init_b == init_prev
3544 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3545 && dump_enabled_p ())
3546 dump_printf_loc (MSG_NOTE, vect_location,
3547 "Queuing group with duplicate access for fixup\n");
3551 /* Fix up groups with duplicate entries by splitting them. */
3552 while (1)
3554 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3555 if (!(it != to_fixup.end ()))
3556 break;
3557 stmt_vec_info grp = *it;
3558 to_fixup.remove (grp);
3560 /* Find the earliest duplicate group member. */
3561 unsigned first_duplicate = -1u;
3562 stmt_vec_info next, g = grp;
3563 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3565 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3566 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3567 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3568 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3569 g = next;
3571 if (first_duplicate == -1U)
3572 continue;
3574 /* Then move all stmts after the first duplicate to a new group.
3575 Note this is a heuristic, but one with the property that the
3576 group GRP itself is fixed up completely. */
3577 g = grp;
3578 stmt_vec_info newgroup = NULL, ng = grp;
3579 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3581 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3583 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3584 if (!newgroup)
3585 newgroup = next;
3586 else
3587 DR_GROUP_NEXT_ELEMENT (ng) = next;
3588 ng = next;
3589 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3591 else
3592 g = DR_GROUP_NEXT_ELEMENT (g);
3594 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3596 /* Fix up the new group, which may still contain duplicates. */
3597 to_fixup.add (newgroup);
3600 dr_vec_info *dr_info;
3601 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3603 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3604 && !vect_analyze_data_ref_access (vinfo, dr_info))
3606 if (dump_enabled_p ())
3607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3608 "not vectorized: complicated access pattern.\n");
3610 if (is_a <bb_vec_info> (vinfo))
3612 /* Mark the statement as not vectorizable. */
3613 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3614 continue;
3616 else
3618 datarefs_copy.release ();
3619 return opt_result::failure_at (dr_info->stmt->stmt,
3620 "not vectorized:"
3621 " complicated access pattern.\n");
3626 datarefs_copy.release ();
3627 return opt_result::success ();
3630 /* Function vect_vfa_segment_size.
3632 Input:
3633 DR_INFO: The data reference.
3634 LENGTH_FACTOR: segment length to consider.
3636 Return a value suitable for the dr_with_seg_len::seg_len field.
3637 This is the "distance travelled" by the pointer from the first
3638 iteration in the segment to the last. Note that it does not include
3639 the size of the access; in effect it only describes the first byte. */
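/* Editorial worked example (values assumed for illustration): for a
   unit-stride access to 4-byte elements DR_STEP is 4, so with
   LENGTH_FACTOR equal to a vectorization factor of 8 the result is
   (8 - 1) * 4 = 28 bytes -- the distance from the address used by the
   first iteration in the segment to the address used by the last.
   Adding the access size (see vect_vfa_access_size) then gives the full
   32 bytes the segment may touch.  */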
3641 static tree
3642 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3644 length_factor = size_binop (MINUS_EXPR,
3645 fold_convert (sizetype, length_factor),
3646 size_one_node);
3647 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3648 length_factor);
3651 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3652 gives the worst-case number of bytes covered by the segment. */
3654 static unsigned HOST_WIDE_INT
3655 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3657 stmt_vec_info stmt_vinfo = dr_info->stmt;
3658 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3659 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3660 unsigned HOST_WIDE_INT access_size = ref_size;
3661 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3663 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3664 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3666 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3667 int misalignment;
3668 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3669 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3670 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3671 == dr_explicit_realign_optimized))
3673 /* We might access a full vector's worth. */
3674 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3676 return access_size;
3679 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3680 describes. */
3682 static unsigned int
3683 vect_vfa_align (dr_vec_info *dr_info)
3685 return dr_alignment (dr_info->dr);
3688 /* Function vect_no_alias_p.
3690 Given data references A and B with equal base and offset, see whether
3691 the alias relation can be decided at compilation time. Return 1 if
3692 it can and the references alias, 0 if it can and the references do
3693 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3694 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3695 of dr_with_seg_len::{seg_len,access_size} for A and B. */
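/* Editorial worked example (assuming positive steps, segment lengths of
   28 bytes and access sizes of 4): if DR_INIT (A) is 0 then A covers
   bytes [0, 32); if DR_INIT (B) is 32 then B covers [32, 64), the
   ranges cannot overlap and the function returns 0.  If DR_INIT (B)
   were 16 instead, the ranges would be known to overlap and the
   function would return 1.  */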
3697 static int
3698 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3699 tree segment_length_a, tree segment_length_b,
3700 unsigned HOST_WIDE_INT access_size_a,
3701 unsigned HOST_WIDE_INT access_size_b)
3703 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3704 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3705 poly_uint64 const_length_a;
3706 poly_uint64 const_length_b;
3708 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3709 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3710 [a, a+12) */
3711 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3713 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3714 offset_a -= const_length_a;
3716 else
3717 const_length_a = tree_to_poly_uint64 (segment_length_a);
3718 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3720 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3721 offset_b -= const_length_b;
3723 else
3724 const_length_b = tree_to_poly_uint64 (segment_length_b);
3726 const_length_a += access_size_a;
3727 const_length_b += access_size_b;
3729 if (ranges_known_overlap_p (offset_a, const_length_a,
3730 offset_b, const_length_b))
3731 return 1;
3733 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3734 offset_b, const_length_b))
3735 return 0;
3737 return -1;
3740 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3741 in DDR is >= VF. */
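/* Editorial example: for accesses a[i] and a[i + 16] in the same loop,
   the recorded dependence distance at this depth is 16, so for any VF
   of at most 16 the vectorized code cannot violate the dependence and
   the corresponding runtime alias check can be dropped.  */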
3743 static bool
3744 dependence_distance_ge_vf (data_dependence_relation *ddr,
3745 unsigned int loop_depth, poly_uint64 vf)
3747 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3748 || DDR_NUM_DIST_VECTS (ddr) == 0)
3749 return false;
3751 /* If the dependence is exact, we should have limited the VF instead. */
3752 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3754 unsigned int i;
3755 lambda_vector dist_v;
3756 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3758 HOST_WIDE_INT dist = dist_v[loop_depth];
3759 if (dist != 0
3760 && !(dist > 0 && DDR_REVERSED_P (ddr))
3761 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3762 return false;
3765 if (dump_enabled_p ())
3766 dump_printf_loc (MSG_NOTE, vect_location,
3767 "dependence distance between %T and %T is >= VF\n",
3768 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3770 return true;
3773 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3775 static void
3776 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3778 dump_printf (dump_kind, "%s (%T) >= ",
3779 lower_bound.unsigned_p ? "unsigned" : "abs",
3780 lower_bound.expr);
3781 dump_dec (dump_kind, lower_bound.min_value);
3784 /* Record that the vectorized loop requires the vec_lower_bound described
3785 by EXPR, UNSIGNED_P and MIN_VALUE. */
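/* Editorial example: if one alias pair requires abs (step) >= 8 and a
   later pair requires abs (step) >= 16 for the same STEP expression,
   the loop below merges the two entries into a single runtime test
   abs (step) >= 16 rather than recording two separate checks.  */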
3787 static void
3788 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3789 poly_uint64 min_value)
3791 vec<vec_lower_bound> &lower_bounds
3792 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3793 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3794 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3796 unsigned_p &= lower_bounds[i].unsigned_p;
3797 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3798 if (lower_bounds[i].unsigned_p != unsigned_p
3799 || maybe_lt (lower_bounds[i].min_value, min_value))
3801 lower_bounds[i].unsigned_p = unsigned_p;
3802 lower_bounds[i].min_value = min_value;
3803 if (dump_enabled_p ())
3805 dump_printf_loc (MSG_NOTE, vect_location,
3806 "updating run-time check to ");
3807 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3808 dump_printf (MSG_NOTE, "\n");
3811 return;
3814 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3815 if (dump_enabled_p ())
3817 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3818 dump_lower_bound (MSG_NOTE, lower_bound);
3819 dump_printf (MSG_NOTE, "\n");
3821 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3824 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3825 will span fewer than GAP bytes. */
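/* Editorial example (numbers assumed): with an estimated vectorization
   factor of 4, a group of 2 contiguous 4-byte scalar accesses gives
   count * size = (4 * 2) * 4 = 32, so any GAP of up to 32 bytes is one
   the vectorized step is unlikely to fall short of and the function
   returns true.  */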
3827 static bool
3828 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3829 poly_int64 gap)
3831 stmt_vec_info stmt_info = dr_info->stmt;
3832 HOST_WIDE_INT count
3833 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3834 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3835 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3836 return (estimated_poly_value (gap)
3837 <= count * vect_get_scalar_dr_size (dr_info));
3840 /* Return true if we know that there is no alias between DR_INFO_A and
3841 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3842 When returning true, set *LOWER_BOUND_OUT to this N. */
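/* Editorial example: for two 4-byte accesses off the same base and step
   with DR_INITs 0 and 12, the pair together spans bytes [0, 16), so
   *LOWER_BOUND_OUT is set to 16 and there is no alias between the two
   accesses whenever abs (DR_STEP) >= 16.  */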
3844 static bool
3845 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3846 poly_uint64 *lower_bound_out)
3848 /* Check that there is a constant gap of known sign between DR_A
3849 and DR_B. */
3850 data_reference *dr_a = dr_info_a->dr;
3851 data_reference *dr_b = dr_info_b->dr;
3852 poly_int64 init_a, init_b;
3853 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3854 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3855 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3856 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3857 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3858 || !ordered_p (init_a, init_b))
3859 return false;
3861 /* Sort DR_A and DR_B by the address they access. */
3862 if (maybe_lt (init_b, init_a))
3864 std::swap (init_a, init_b);
3865 std::swap (dr_info_a, dr_info_b);
3866 std::swap (dr_a, dr_b);
3869 /* If the two accesses could be dependent within a scalar iteration,
3870 make sure that we'd retain their order. */
3871 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3872 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3873 return false;
3875 /* There is no alias if abs (DR_STEP) is greater than or equal to
3876 the bytes spanned by the combination of the two accesses. */
3877 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3878 return true;
3881 /* Function vect_prune_runtime_alias_test_list.
3883 Prune a list of ddrs to be tested at run-time by versioning for alias.
3884 Merge several alias checks into one if possible.
3885 Return FALSE if the resulting list of ddrs is longer than allowed by
3886 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
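/* Editorial example: if a loop stores to a[i] and reads both b[i] and
   b[i + 1] through possibly aliasing pointers, the two DDRs against the
   store describe overlapping segments of the same two base objects, so
   prune_runtime_alias_test_list can merge them into one a-versus-b
   range check instead of emitting two versioning tests.  */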
3888 opt_result
3889 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3891 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3892 hash_set <tree_pair_hash> compared_objects;
3894 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3895 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3896 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3897 const vec<vec_object_pair> &check_unequal_addrs
3898 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3899 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3900 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3902 ddr_p ddr;
3903 unsigned int i;
3904 tree length_factor;
3906 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3908 /* Step values are irrelevant for aliasing if the number of vector
3909 iterations is equal to the number of scalar iterations (which can
3910 happen for fully-SLP loops). */
3911 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3913 if (!vf_one_p)
3915 /* Convert the checks for nonzero steps into bound tests. */
3916 tree value;
3917 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3918 vect_check_lower_bound (loop_vinfo, value, true, 1);
3921 if (may_alias_ddrs.is_empty ())
3922 return opt_result::success ();
3924 comp_alias_ddrs.create (may_alias_ddrs.length ());
3926 unsigned int loop_depth
3927 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3928 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3930 /* First, we collect all data ref pairs for aliasing checks. */
3931 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3933 poly_uint64 lower_bound;
3934 tree segment_length_a, segment_length_b;
3935 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3936 unsigned int align_a, align_b;
3938 /* Ignore the alias if the VF we chose ended up being no greater
3939 than the dependence distance. */
3940 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3941 continue;
3943 if (DDR_OBJECT_A (ddr))
3945 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3946 if (!compared_objects.add (new_pair))
3948 if (dump_enabled_p ())
3949 dump_printf_loc (MSG_NOTE, vect_location,
3950 "checking that %T and %T"
3951 " have different addresses\n",
3952 new_pair.first, new_pair.second);
3953 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3955 continue;
3958 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3959 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3961 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3962 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3964 bool preserves_scalar_order_p
3965 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3966 bool ignore_step_p
3967 = (vf_one_p
3968 && (preserves_scalar_order_p
3969 || operand_equal_p (DR_STEP (dr_info_a->dr),
3970 DR_STEP (dr_info_b->dr))));
3972 /* Skip the pair if inter-iteration dependencies are irrelevant
3973 and intra-iteration dependencies are guaranteed to be honored. */
3974 if (ignore_step_p
3975 && (preserves_scalar_order_p
3976 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3977 &lower_bound)))
3979 if (dump_enabled_p ())
3980 dump_printf_loc (MSG_NOTE, vect_location,
3981 "no need for alias check between "
3982 "%T and %T when VF is 1\n",
3983 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3984 continue;
3987 /* See whether we can handle the alias using a bounds check on
3988 the step, and whether that's likely to be the best approach.
3989 (It might not be, for example, if the minimum step is much larger
3990 than the number of bytes handled by one vector iteration.) */
3991 if (!ignore_step_p
3992 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3993 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3994 &lower_bound)
3995 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3996 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3998 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3999 if (dump_enabled_p ())
4001 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
4002 "%T and %T when the step %T is outside ",
4003 DR_REF (dr_info_a->dr),
4004 DR_REF (dr_info_b->dr),
4005 DR_STEP (dr_info_a->dr));
4006 if (unsigned_p)
4007 dump_printf (MSG_NOTE, "[0");
4008 else
4010 dump_printf (MSG_NOTE, "(");
4011 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
4013 dump_printf (MSG_NOTE, ", ");
4014 dump_dec (MSG_NOTE, lower_bound);
4015 dump_printf (MSG_NOTE, ")\n");
4017 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
4018 unsigned_p, lower_bound);
4019 continue;
4022 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
4023 if (dr_group_first_a)
4025 stmt_info_a = dr_group_first_a;
4026 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
4029 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
4030 if (dr_group_first_b)
4032 stmt_info_b = dr_group_first_b;
4033 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
4036 if (ignore_step_p)
4038 segment_length_a = size_zero_node;
4039 segment_length_b = size_zero_node;
4041 else
4043 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
4044 DR_STEP (dr_info_b->dr), 0))
4045 length_factor = scalar_loop_iters;
4046 else
4047 length_factor = size_int (vect_factor);
4048 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
4049 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
4051 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
4052 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
4053 align_a = vect_vfa_align (dr_info_a);
4054 align_b = vect_vfa_align (dr_info_b);
4056 /* See whether the alias is known at compilation time. */
4057 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
4058 DR_BASE_ADDRESS (dr_info_b->dr), 0)
4059 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
4060 DR_OFFSET (dr_info_b->dr), 0)
4061 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
4062 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
4063 && poly_int_tree_p (segment_length_a)
4064 && poly_int_tree_p (segment_length_b))
4066 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
4067 segment_length_a,
4068 segment_length_b,
4069 access_size_a,
4070 access_size_b);
4071 if (res >= 0 && dump_enabled_p ())
4073 dump_printf_loc (MSG_NOTE, vect_location,
4074 "can tell at compile time that %T and %T",
4075 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4076 if (res == 0)
4077 dump_printf (MSG_NOTE, " do not alias\n");
4078 else
4079 dump_printf (MSG_NOTE, " alias\n");
4082 if (res == 0)
4083 continue;
4085 if (res == 1)
4086 return opt_result::failure_at (stmt_info_b->stmt,
4087 "not vectorized:"
4088 " compilation time alias: %G%G",
4089 stmt_info_a->stmt,
4090 stmt_info_b->stmt);
4093 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
4094 access_size_a, align_a);
4095 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
4096 access_size_b, align_b);
4097 /* Canonicalize the order to be the one that's needed for accurate
4098 RAW, WAR and WAW flags, in cases where the data references are
4099 well-ordered. The order doesn't really matter otherwise,
4100 but we might as well be consistent. */
4101 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
4102 std::swap (dr_a, dr_b);
4104 dr_with_seg_len_pair_t dr_with_seg_len_pair
4105 (dr_a, dr_b, (preserves_scalar_order_p
4106 ? dr_with_seg_len_pair_t::WELL_ORDERED
4107 : dr_with_seg_len_pair_t::REORDERED));
4109 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
4112 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
4114 unsigned int count = (comp_alias_ddrs.length ()
4115 + check_unequal_addrs.length ());
4117 if (count
4118 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
4119 == VECT_COST_MODEL_VERY_CHEAP))
4120 return opt_result::failure_at
4121 (vect_location, "would need a runtime alias check\n");
4123 if (dump_enabled_p ())
4124 dump_printf_loc (MSG_NOTE, vect_location,
4125 "improved number of alias checks from %d to %d\n",
4126 may_alias_ddrs.length (), count);
4127 unsigned limit = param_vect_max_version_for_alias_checks;
4128 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
4129 limit = param_vect_max_version_for_alias_checks * 6 / 10;
4130 if (count > limit)
4131 return opt_result::failure_at
4132 (vect_location,
4133 "number of versioning for alias run-time tests exceeds %d "
4134 "(--param vect-max-version-for-alias-checks)\n", limit);
4136 return opt_result::success ();
4139 /* Check whether we can use an internal function for a gather load
4140 or scatter store. READ_P is true for loads and false for stores.
4141 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
4142 the type of the memory elements being loaded or stored. OFFSET_TYPE
4143 is the type of the offset that is being applied to the invariant
4144 base address. SCALE is the amount by which the offset should
4145 be multiplied *after* it has been converted to address width.
4147 Return true if the function is supported, storing the function id in
4148 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
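/* Editorial sketch of the retry loop below: if the original offset is a
   16-bit integer but the target only provides gathers or scatters with
   32-bit or 64-bit offset vectors, the first attempt fails, OFFSET_TYPE
   is widened to an integer type of twice the precision and the check is
   repeated; the search stops once the offset type is at least
   pointer-sized and at least as wide as the vector elements.  */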
4150 bool
4151 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
4152 tree vectype, tree memory_type, tree offset_type,
4153 int scale, internal_fn *ifn_out,
4154 tree *offset_vectype_out)
4156 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
4157 unsigned int element_bits = vector_element_bits (vectype);
4158 if (element_bits != memory_bits)
4159 /* For now the vector elements must be the same width as the
4160 memory elements. */
4161 return false;
4163 /* Work out which function we need. */
4164 internal_fn ifn, alt_ifn, alt_ifn2;
4165 if (read_p)
4167 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
4168 alt_ifn = IFN_MASK_GATHER_LOAD;
4169 /* When the target supports MASK_LEN_GATHER_LOAD, we always
4170 use MASK_LEN_GATHER_LOAD regardless of whether len and
4171 mask are valid or not. */
4172 alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
4174 else
4176 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
4177 alt_ifn = IFN_MASK_SCATTER_STORE;
4178 /* When the target supports MASK_LEN_SCATTER_STORE, we always
4179 use MASK_LEN_SCATTER_STORE regardless of whether len and
4180 mask are valid or not. */
4181 alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
4184 for (;;)
4186 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
4187 if (!offset_vectype)
4188 return false;
4190 /* Test whether the target supports this combination. */
4191 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
4192 offset_vectype, scale))
4194 *ifn_out = ifn;
4195 *offset_vectype_out = offset_vectype;
4196 return true;
4198 else if (!masked_p
4199 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
4200 memory_type,
4201 offset_vectype,
4202 scale))
4204 *ifn_out = alt_ifn;
4205 *offset_vectype_out = offset_vectype;
4206 return true;
4208 else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
4209 memory_type,
4210 offset_vectype, scale))
4212 *ifn_out = alt_ifn2;
4213 *offset_vectype_out = offset_vectype;
4214 return true;
4217 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
4218 && TYPE_PRECISION (offset_type) >= element_bits)
4219 return false;
4221 offset_type = build_nonstandard_integer_type
4222 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
4226 /* STMT_INFO is a call to an internal gather load or scatter store function.
4227 Describe the operation in INFO. */
4229 static void
4230 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
4231 gather_scatter_info *info)
4233 gcall *call = as_a <gcall *> (stmt_info->stmt);
4234 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4235 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4237 info->ifn = gimple_call_internal_fn (call);
4238 info->decl = NULL_TREE;
4239 info->base = gimple_call_arg (call, 0);
4240 info->offset = gimple_call_arg (call, 1);
4241 info->offset_dt = vect_unknown_def_type;
4242 info->offset_vectype = NULL_TREE;
4243 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
4244 info->element_type = TREE_TYPE (vectype);
4245 info->memory_type = TREE_TYPE (DR_REF (dr));
4248 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
4249 gather load or scatter store. Describe the operation in *INFO if so. */
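/* Editorial example of the decomposition performed below: for a gather
   like

     for (int i = 0; i < n; i++)
       sum += data[idx[i]];

   the loop-invariant base becomes the address of data, SCALE becomes
   sizeof (*data), and the offset is the SSA name holding the value of
   idx[i] (possibly after a widening conversion); everything loop
   invariant is peeled off the offset into BASE so that only the
   per-lane index remains to be vectorized.  */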
4251 bool
4252 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
4253 gather_scatter_info *info)
4255 HOST_WIDE_INT scale = 1;
4256 poly_int64 pbitpos, pbitsize;
4257 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4258 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4259 tree offtype = NULL_TREE;
4260 tree decl = NULL_TREE, base, off;
4261 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4262 tree memory_type = TREE_TYPE (DR_REF (dr));
4263 machine_mode pmode;
4264 int punsignedp, reversep, pvolatilep = 0;
4265 internal_fn ifn;
4266 tree offset_vectype;
4267 bool masked_p = false;
4269 /* See whether this is already a call to a gather/scatter internal function.
4270 If not, see whether it's a masked load or store. */
4271 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
4272 if (call && gimple_call_internal_p (call))
4274 ifn = gimple_call_internal_fn (call);
4275 if (internal_gather_scatter_fn_p (ifn))
4277 vect_describe_gather_scatter_call (stmt_info, info);
4278 return true;
4280 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4283 /* True if we should aim to use internal functions rather than
4284 built-in functions. */
4285 bool use_ifn_p = (DR_IS_READ (dr)
4286 ? supports_vec_gather_load_p (TYPE_MODE (vectype))
4287 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4289 base = DR_REF (dr);
4290 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF;
4291 see if we can use the def stmt of the address. */
4292 if (masked_p
4293 && TREE_CODE (base) == MEM_REF
4294 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4295 && integer_zerop (TREE_OPERAND (base, 1))
4296 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4298 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4299 if (is_gimple_assign (def_stmt)
4300 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4301 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4304 /* The gather and scatter builtins need address of the form
4305 loop_invariant + vector * {1, 2, 4, 8}
4307 or loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4308 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4309 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4310 multiplications and additions in it. To get a vector, we need
4311 a single SSA_NAME that will be defined in the loop and will
4312 contain everything that is not loop invariant and that can be
4313 vectorized. The following code attempts to find such a preexisting
4314 SSA_NAME OFF and put the loop invariants into a tree BASE
4315 that can be gimplified before the loop. */
4316 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4317 &punsignedp, &reversep, &pvolatilep);
4318 if (reversep)
4319 return false;
4321 /* PR 107346. Packed structs can have fields at offsets that are not
4322 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4323 if (!multiple_p (pbitpos, BITS_PER_UNIT))
4324 return false;
4326 /* We need to be able to form an address to the base which for example
4327 isn't possible for hard registers. */
4328 if (may_be_nonaddressable_p (base))
4329 return false;
4331 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4333 if (TREE_CODE (base) == MEM_REF)
4335 if (!integer_zerop (TREE_OPERAND (base, 1)))
4337 if (off == NULL_TREE)
4338 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4339 else
4340 off = size_binop (PLUS_EXPR, off,
4341 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4343 base = TREE_OPERAND (base, 0);
4345 else
4346 base = build_fold_addr_expr (base);
4348 if (off == NULL_TREE)
4349 off = size_zero_node;
4351 /* If base is not loop invariant then, when off is 0, we start with just
4352 the constant offset in the loop invariant BASE and continue with base
4353 as OFF; otherwise give up.
4354 We could handle that case by gimplifying the addition of base + off
4355 into some SSA_NAME and using that as off, but for now punt. */
4356 if (!expr_invariant_in_loop_p (loop, base))
4358 if (!integer_zerop (off))
4359 return false;
4360 off = base;
4361 base = size_int (pbytepos);
4363 /* Otherwise put base + constant offset into the loop invariant BASE
4364 and continue with OFF. */
4365 else
4367 base = fold_convert (sizetype, base);
4368 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4371 /* OFF at this point may be either a SSA_NAME or some tree expression
4372 from get_inner_reference. Try to peel off loop invariants from it
4373 into BASE as long as possible. */
4374 STRIP_NOPS (off);
4375 while (offtype == NULL_TREE)
4377 enum tree_code code;
4378 tree op0, op1, add = NULL_TREE;
4380 if (TREE_CODE (off) == SSA_NAME)
4382 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4384 if (expr_invariant_in_loop_p (loop, off))
4385 return false;
4387 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4388 break;
4390 op0 = gimple_assign_rhs1 (def_stmt);
4391 code = gimple_assign_rhs_code (def_stmt);
4392 op1 = gimple_assign_rhs2 (def_stmt);
4394 else
4396 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4397 return false;
4398 code = TREE_CODE (off);
4399 extract_ops_from_tree (off, &code, &op0, &op1);
4401 switch (code)
4403 case POINTER_PLUS_EXPR:
4404 case PLUS_EXPR:
4405 if (expr_invariant_in_loop_p (loop, op0))
4407 add = op0;
4408 off = op1;
4409 do_add:
4410 add = fold_convert (sizetype, add);
4411 if (scale != 1)
4412 add = size_binop (MULT_EXPR, add, size_int (scale));
4413 base = size_binop (PLUS_EXPR, base, add);
4414 continue;
4416 if (expr_invariant_in_loop_p (loop, op1))
4418 add = op1;
4419 off = op0;
4420 goto do_add;
4422 break;
4423 case MINUS_EXPR:
4424 if (expr_invariant_in_loop_p (loop, op1))
4426 add = fold_convert (sizetype, op1);
4427 add = size_binop (MINUS_EXPR, size_zero_node, add);
4428 off = op0;
4429 goto do_add;
4431 break;
4432 case MULT_EXPR:
4433 if (scale == 1 && tree_fits_shwi_p (op1))
4435 int new_scale = tree_to_shwi (op1);
4436 /* Only treat this as a scaling operation if the target
4437 supports it for at least some offset type. */
4438 if (use_ifn_p
4439 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4440 masked_p, vectype, memory_type,
4441 signed_char_type_node,
4442 new_scale, &ifn,
4443 &offset_vectype)
4444 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4445 masked_p, vectype, memory_type,
4446 unsigned_char_type_node,
4447 new_scale, &ifn,
4448 &offset_vectype))
4449 break;
4450 scale = new_scale;
4451 off = op0;
4452 continue;
4454 break;
4455 case SSA_NAME:
4456 off = op0;
4457 continue;
4458 CASE_CONVERT:
4459 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4460 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4461 break;
4463 /* Don't include the conversion if the target is happy with
4464 the current offset type. */
4465 if (use_ifn_p
4466 && TREE_CODE (off) == SSA_NAME
4467 && !POINTER_TYPE_P (TREE_TYPE (off))
4468 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4469 masked_p, vectype, memory_type,
4470 TREE_TYPE (off), scale, &ifn,
4471 &offset_vectype))
4472 break;
4474 if (TYPE_PRECISION (TREE_TYPE (op0))
4475 == TYPE_PRECISION (TREE_TYPE (off)))
4477 off = op0;
4478 continue;
4481 /* Include the conversion if it is widening and either we're using
4482 the IFN path, the target can handle the converted-from
4483 offset, or the current size is not already the same as the
4484 data vector element size. */
4485 if ((TYPE_PRECISION (TREE_TYPE (op0))
4486 < TYPE_PRECISION (TREE_TYPE (off)))
4487 && (use_ifn_p
4488 || (DR_IS_READ (dr)
4489 ? (targetm.vectorize.builtin_gather
4490 && targetm.vectorize.builtin_gather (vectype,
4491 TREE_TYPE (op0),
4492 scale))
4493 : (targetm.vectorize.builtin_scatter
4494 && targetm.vectorize.builtin_scatter (vectype,
4495 TREE_TYPE (op0),
4496 scale)))
4497 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4498 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4500 off = op0;
4501 offtype = TREE_TYPE (off);
4502 STRIP_NOPS (off);
4503 continue;
4505 break;
4506 default:
4507 break;
4509 break;
4512 /* If at the end OFF still isn't a SSA_NAME or isn't
4513 defined in the loop, punt. */
4514 if (TREE_CODE (off) != SSA_NAME
4515 || expr_invariant_in_loop_p (loop, off))
4516 return false;
4518 if (offtype == NULL_TREE)
4519 offtype = TREE_TYPE (off);
4521 if (use_ifn_p)
4523 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4524 vectype, memory_type, offtype, scale,
4525 &ifn, &offset_vectype))
4526 ifn = IFN_LAST;
4527 decl = NULL_TREE;
4529 else
4531 if (DR_IS_READ (dr))
4533 if (targetm.vectorize.builtin_gather)
4534 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4536 else
4538 if (targetm.vectorize.builtin_scatter)
4539 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4541 ifn = IFN_LAST;
4542 /* The offset vector type will be read from DECL when needed. */
4543 offset_vectype = NULL_TREE;
4546 info->ifn = ifn;
4547 info->decl = decl;
4548 info->base = base;
4549 info->offset = off;
4550 info->offset_dt = vect_unknown_def_type;
4551 info->offset_vectype = offset_vectype;
4552 info->scale = scale;
4553 info->element_type = TREE_TYPE (vectype);
4554 info->memory_type = memory_type;
4555 return true;
4558 /* Find the data references in STMT, analyze them with respect to LOOP and
4559 append them to DATAREFS. Return false if datarefs in this stmt cannot
4560 be handled. */
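/* Editorial example: an aggregate copy such as *p = *q contains two data
   references in one statement and is rejected below, whereas a masked
   load lowered to a .MASK_LOAD internal call contributes its single
   (artificial) data reference to DATAREFS.  */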
4562 opt_result
4563 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4564 vec<data_reference_p> *datarefs,
4565 vec<int> *dataref_groups, int group_id)
4567 /* We can ignore clobbers for dataref analysis - they are removed during
4568 loop vectorization and BB vectorization checks dependences with a
4569 stmt walk. */
4570 if (gimple_clobber_p (stmt))
4571 return opt_result::success ();
4573 if (gimple_has_volatile_ops (stmt))
4574 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4575 stmt);
4577 if (stmt_can_throw_internal (cfun, stmt))
4578 return opt_result::failure_at (stmt,
4579 "not vectorized:"
4580 " statement can throw an exception: %G",
4581 stmt);
4583 auto_vec<data_reference_p, 2> refs;
4584 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4585 if (!res)
4586 return res;
4588 if (refs.is_empty ())
4589 return opt_result::success ();
4591 if (refs.length () > 1)
4593 while (!refs.is_empty ())
4594 free_data_ref (refs.pop ());
4595 return opt_result::failure_at (stmt,
4596 "not vectorized: more than one "
4597 "data ref in stmt: %G", stmt);
4600 data_reference_p dr = refs.pop ();
4601 if (gcall *call = dyn_cast <gcall *> (stmt))
4602 if (!gimple_call_internal_p (call)
4603 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4604 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4606 free_data_ref (dr);
4607 return opt_result::failure_at (stmt,
4608 "not vectorized: dr in a call %G", stmt);
4611 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4612 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4614 free_data_ref (dr);
4615 return opt_result::failure_at (stmt,
4616 "not vectorized:"
4617 " statement is an unsupported"
4618 " bitfield access %G", stmt);
4621 if (DR_BASE_ADDRESS (dr)
4622 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4624 free_data_ref (dr);
4625 return opt_result::failure_at (stmt,
4626 "not vectorized:"
4627 " base addr of dr is a constant\n");
4630 /* Check whether this may be a SIMD lane access and adjust the
4631 DR to make it easier for us to handle it. */
4632 if (loop
4633 && loop->simduid
4634 && (!DR_BASE_ADDRESS (dr)
4635 || !DR_OFFSET (dr)
4636 || !DR_INIT (dr)
4637 || !DR_STEP (dr)))
4639 struct data_reference *newdr
4640 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4641 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4642 if (DR_BASE_ADDRESS (newdr)
4643 && DR_OFFSET (newdr)
4644 && DR_INIT (newdr)
4645 && DR_STEP (newdr)
4646 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4647 && integer_zerop (DR_STEP (newdr)))
4649 tree base_address = DR_BASE_ADDRESS (newdr);
4650 tree off = DR_OFFSET (newdr);
4651 tree step = ssize_int (1);
4652 if (integer_zerop (off)
4653 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4655 off = TREE_OPERAND (base_address, 1);
4656 base_address = TREE_OPERAND (base_address, 0);
4658 STRIP_NOPS (off);
4659 if (TREE_CODE (off) == MULT_EXPR
4660 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4662 step = TREE_OPERAND (off, 1);
4663 off = TREE_OPERAND (off, 0);
4664 STRIP_NOPS (off);
4666 if (CONVERT_EXPR_P (off)
4667 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4668 < TYPE_PRECISION (TREE_TYPE (off))))
4669 off = TREE_OPERAND (off, 0);
4670 if (TREE_CODE (off) == SSA_NAME)
4672 gimple *def = SSA_NAME_DEF_STMT (off);
4673 /* Look through widening conversion. */
4674 if (is_gimple_assign (def)
4675 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4677 tree rhs1 = gimple_assign_rhs1 (def);
4678 if (TREE_CODE (rhs1) == SSA_NAME
4679 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4680 && (TYPE_PRECISION (TREE_TYPE (off))
4681 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4682 def = SSA_NAME_DEF_STMT (rhs1);
4684 if (is_gimple_call (def)
4685 && gimple_call_internal_p (def)
4686 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4688 tree arg = gimple_call_arg (def, 0);
4689 tree reft = TREE_TYPE (DR_REF (newdr));
4690 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4691 arg = SSA_NAME_VAR (arg);
4692 if (arg == loop->simduid
4693 /* For now. */
4694 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4696 DR_BASE_ADDRESS (newdr) = base_address;
4697 DR_OFFSET (newdr) = ssize_int (0);
4698 DR_STEP (newdr) = step;
4699 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4700 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4701 /* Mark as simd-lane access. */
4702 tree arg2 = gimple_call_arg (def, 1);
4703 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4704 free_data_ref (dr);
4705 datarefs->safe_push (newdr);
4706 if (dataref_groups)
4707 dataref_groups->safe_push (group_id);
4708 return opt_result::success ();
4713 free_data_ref (newdr);
4716 datarefs->safe_push (dr);
4717 if (dataref_groups)
4718 dataref_groups->safe_push (group_id);
4719 return opt_result::success ();
4722 /* Function vect_analyze_data_refs.
4724 Find all the data references in the loop or basic block.
4726 The general structure of the analysis of data refs in the vectorizer is as
4727 follows:
4728 1- vect_analyze_data_refs(loop/bb): call
4729 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4730 in the loop/bb and their dependences.
4731 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4732 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4733 4- vect_analyze_drs_access(): check that ref_stmt.step is ok. */
4737 opt_result
4738 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4740 class loop *loop = NULL;
4741 unsigned int i;
4742 struct data_reference *dr;
4743 tree scalar_type;
4745 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4747 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4748 loop = LOOP_VINFO_LOOP (loop_vinfo);
4750 /* Go through the data-refs, check that the analysis succeeded. Update
4751 pointer from stmt_vec_info struct to DR and vectype. */
4753 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4754 FOR_EACH_VEC_ELT (datarefs, i, dr)
4756 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4757 poly_uint64 vf;
4759 gcc_assert (DR_REF (dr));
4760 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4761 gcc_assert (!stmt_info->dr_aux.dr);
4762 stmt_info->dr_aux.dr = dr;
4763 stmt_info->dr_aux.stmt = stmt_info;
4765 /* Check that analysis of the data-ref succeeded. */
4766 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4767 || !DR_STEP (dr))
4769 bool maybe_gather
4770 = DR_IS_READ (dr)
4771 && !TREE_THIS_VOLATILE (DR_REF (dr));
4772 bool maybe_scatter
4773 = DR_IS_WRITE (dr)
4774 && !TREE_THIS_VOLATILE (DR_REF (dr));
4776 /* If the target supports vector gather loads or scatter stores,
4777 see whether they can be used. */
4778 if (is_a <loop_vec_info> (vinfo)
4779 && !nested_in_vect_loop_p (loop, stmt_info))
4781 if (maybe_gather || maybe_scatter)
4783 if (maybe_gather)
4784 gatherscatter = GATHER;
4785 else
4786 gatherscatter = SCATTER;
4790 if (gatherscatter == SG_NONE)
4792 if (dump_enabled_p ())
4793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4794 "not vectorized: data ref analysis "
4795 "failed %G", stmt_info->stmt);
4796 if (is_a <bb_vec_info> (vinfo))
4798 /* In BB vectorization the ref can still participate
4799 in dependence analysis, we just can't vectorize it. */
4800 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4801 continue;
4803 return opt_result::failure_at (stmt_info->stmt,
4804 "not vectorized:"
4805 " data ref analysis failed: %G",
4806 stmt_info->stmt);
4810 /* See if this was detected as SIMD lane access. */
4811 if (dr->aux == (void *)-1
4812 || dr->aux == (void *)-2
4813 || dr->aux == (void *)-3
4814 || dr->aux == (void *)-4)
4816 if (nested_in_vect_loop_p (loop, stmt_info))
4817 return opt_result::failure_at (stmt_info->stmt,
4818 "not vectorized:"
4819 " data ref analysis failed: %G",
4820 stmt_info->stmt);
4821 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4822 = -(uintptr_t) dr->aux;
4825 tree base = get_base_address (DR_REF (dr));
4826 if (base && VAR_P (base) && DECL_NONALIASED (base))
4828 if (dump_enabled_p ())
4829 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4830 "not vectorized: base object not addressable "
4831 "for stmt: %G", stmt_info->stmt);
4832 if (is_a <bb_vec_info> (vinfo))
4834 /* In BB vectorization the ref can still participate
4835 in dependence analysis, we just can't vectorize it. */
4836 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4837 continue;
4839 return opt_result::failure_at (stmt_info->stmt,
4840 "not vectorized: base object not"
4841 " addressable for stmt: %G",
4842 stmt_info->stmt);
4845 if (is_a <loop_vec_info> (vinfo)
4846 && DR_STEP (dr)
4847 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4849 if (nested_in_vect_loop_p (loop, stmt_info))
4850 return opt_result::failure_at (stmt_info->stmt,
4851 "not vectorized: "
4852 "not suitable for strided load %G",
4853 stmt_info->stmt);
4854 STMT_VINFO_STRIDED_P (stmt_info) = true;
4857 /* Update DR field in stmt_vec_info struct. */
4859 /* If the dataref is in an inner-loop of the loop that is considered
4860 for vectorization, we also want to analyze the access relative to
4861 the outer-loop (DR contains information only relative to the
4862 inner-most enclosing loop). We do that by building a reference to the
4863 first location accessed by the inner-loop, and analyze it relative to
4864 the outer-loop. */
4865 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4867 /* Build a reference to the first location accessed by the
4868 inner loop: *(BASE + INIT + OFFSET). By construction,
4869 this address must be invariant in the inner loop, so we
4870 can consider it as being used in the outer loop. */
4871 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4872 tree offset = unshare_expr (DR_OFFSET (dr));
4873 tree init = unshare_expr (DR_INIT (dr));
4874 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4875 init, offset);
4876 tree init_addr = fold_build_pointer_plus (base, init_offset);
4877 tree init_ref = build_fold_indirect_ref (init_addr);
4879 if (dump_enabled_p ())
4880 dump_printf_loc (MSG_NOTE, vect_location,
4881 "analyze in outer loop: %T\n", init_ref);
4883 opt_result res
4884 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4885 init_ref, loop, stmt_info->stmt);
4886 if (!res)
4887 /* dr_analyze_innermost already explained the failure. */
4888 return res;
4890 if (dump_enabled_p ())
4891 dump_printf_loc (MSG_NOTE, vect_location,
4892 "\touter base_address: %T\n"
4893 "\touter offset from base address: %T\n"
4894 "\touter constant offset from base address: %T\n"
4895 "\touter step: %T\n"
4896 "\touter base alignment: %d\n\n"
4897 "\touter base misalignment: %d\n"
4898 "\touter offset alignment: %d\n"
4899 "\touter step alignment: %d\n",
4900 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4901 STMT_VINFO_DR_OFFSET (stmt_info),
4902 STMT_VINFO_DR_INIT (stmt_info),
4903 STMT_VINFO_DR_STEP (stmt_info),
4904 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4905 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4906 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4907 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4910 /* Set vectype for STMT. */
4911 scalar_type = TREE_TYPE (DR_REF (dr));
4912 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4913 if (!vectype)
4915 if (dump_enabled_p ())
4917 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4918 "not vectorized: no vectype for stmt: %G",
4919 stmt_info->stmt);
4920 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4921 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4922 scalar_type);
4923 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4926 if (is_a <bb_vec_info> (vinfo))
4928 /* No vector type is fine, the ref can still participate
4929 in dependence analysis, we just can't vectorize it. */
4930 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4931 continue;
4933 if (fatal)
4934 *fatal = false;
4935 return opt_result::failure_at (stmt_info->stmt,
4936 "not vectorized:"
4937 " no vectype for stmt: %G"
4938 " scalar_type: %T\n",
4939 stmt_info->stmt, scalar_type);
4941 else
4943 if (dump_enabled_p ())
4944 dump_printf_loc (MSG_NOTE, vect_location,
4945 "got vectype for stmt: %G%T\n",
4946 stmt_info->stmt, vectype);
4949 /* Adjust the minimal vectorization factor according to the
4950 vector type. */
4951 vf = TYPE_VECTOR_SUBPARTS (vectype);
4952 *min_vf = upper_bound (*min_vf, vf);
4954 /* Leave the BB vectorizer to pick the vector type later, based on
4955 the final dataref group size and SLP node size. */
4956 if (is_a <loop_vec_info> (vinfo))
4957 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4959 if (gatherscatter != SG_NONE)
4961 gather_scatter_info gs_info;
4962 if (!vect_check_gather_scatter (stmt_info,
4963 as_a <loop_vec_info> (vinfo),
4964 &gs_info)
4965 || !get_vectype_for_scalar_type (vinfo,
4966 TREE_TYPE (gs_info.offset)))
4968 if (fatal)
4969 *fatal = false;
4970 return opt_result::failure_at
4971 (stmt_info->stmt,
4972 (gatherscatter == GATHER)
4973 ? "not vectorized: not suitable for gather load %G"
4974 : "not vectorized: not suitable for scatter store %G",
4975 stmt_info->stmt);
4977 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4981 /* We used to stop processing and prune the list here. Verify we no
4982 longer need to. */
4983 gcc_assert (i == datarefs.length ());
4985 return opt_result::success ();
4989 /* Function vect_get_new_vect_var.
4991 Returns a name for a new variable. The current naming scheme uses a
4992 prefix such as "vect_", "vectp_" or "stmp_" (depending on the value of
4993 VAR_KIND) for vectorizer-generated variables, prepending it to NAME
4994 when NAME is provided. */
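/* Editorial usage sketch (names invented for illustration):
   vect_get_new_vect_var (ptr_type, vect_pointer_var, "a") yields a
   temporary named along the lines of "vectp_a", while passing a NULL
   NAME falls back to the bare prefix.  */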
4996 tree
4997 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4999 const char *prefix;
5000 tree new_vect_var;
5002 switch (var_kind)
5004 case vect_simple_var:
5005 prefix = "vect";
5006 break;
5007 case vect_scalar_var:
5008 prefix = "stmp";
5009 break;
5010 case vect_mask_var:
5011 prefix = "mask";
5012 break;
5013 case vect_pointer_var:
5014 prefix = "vectp";
5015 break;
5016 default:
5017 gcc_unreachable ();
5020 if (name)
5022 char* tmp = concat (prefix, "_", name, NULL);
5023 new_vect_var = create_tmp_reg (type, tmp);
5024 free (tmp);
5026 else
5027 new_vect_var = create_tmp_reg (type, prefix);
5029 return new_vect_var;
5032 /* Like vect_get_new_vect_var but return an SSA name. */
5034 tree
5035 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
5037 const char *prefix;
5038 tree new_vect_var;
5040 switch (var_kind)
5042 case vect_simple_var:
5043 prefix = "vect";
5044 break;
5045 case vect_scalar_var:
5046 prefix = "stmp";
5047 break;
5048 case vect_pointer_var:
5049 prefix = "vectp";
5050 break;
5051 default:
5052 gcc_unreachable ();
5055 if (name)
5057 char* tmp = concat (prefix, "_", name, NULL);
5058 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
5059 free (tmp);
5061 else
5062 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
5064 return new_vect_var;
5067 /* Duplicate points-to info on NAME from DR_INFO. */
5069 static void
5070 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
5072 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
5073 /* DR_PTR_INFO is for a base SSA name, not including constant or
5074 variable offsets in the ref so its alignment info does not apply. */
5075 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
5078 /* Function vect_create_addr_base_for_vector_ref.
5080 Create an expression that computes the address of the first memory location
5081 that will be accessed for a data reference.
5083 Input:
5084 STMT_INFO: The statement containing the data reference.
5085 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
5086 OFFSET: Optional. If supplied, it is added to the initial address.
5087 LOOP: Specifies relative to which loop-nest the address should be computed.
5088 For example, when the dataref is in an inner-loop nested in an
5089 outer-loop that is now being vectorized, LOOP can be either the
5090 outer-loop, or the inner-loop. The first memory location accessed
5091 by the following dataref ('in' points to short):
5093 for (i=0; i<N; i++)
5094 for (j=0; j<M; j++)
5095 s += in[i+j]
5097 is as follows:
5098 if LOOP=i_loop: &in (relative to i_loop)
5099 if LOOP=j_loop: &in+i*2B (relative to j_loop)
5101 Output:
5102 1. Return an SSA_NAME whose value is the address of the memory location of
5103 the first vector of the data reference.
5104 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
5105 these statement(s) which define the returned SSA_NAME.
5107 FORNOW: We are only handling array accesses with step 1. */
5109 tree
5110 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
5111 gimple_seq *new_stmt_list,
5112 tree offset)
5114 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5115 struct data_reference *dr = dr_info->dr;
5116 const char *base_name;
5117 tree addr_base;
5118 tree dest;
5119 gimple_seq seq = NULL;
5120 tree vect_ptr_type;
5121 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5122 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
5124 tree data_ref_base = unshare_expr (drb->base_address);
5125 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
5126 tree init = unshare_expr (drb->init);
5128 if (loop_vinfo)
5129 base_name = get_name (data_ref_base);
5130 else
5132 base_offset = ssize_int (0);
5133 init = ssize_int (0);
5134 base_name = get_name (DR_REF (dr));
5137 /* Create base_offset */
5138 base_offset = size_binop (PLUS_EXPR,
5139 fold_convert (sizetype, base_offset),
5140 fold_convert (sizetype, init));
5142 if (offset)
5144 offset = fold_convert (sizetype, offset);
5145 base_offset = fold_build2 (PLUS_EXPR, sizetype,
5146 base_offset, offset);
5149 /* base + base_offset */
5150 if (loop_vinfo)
5151 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
5152 else
5153 addr_base = build1 (ADDR_EXPR,
5154 build_pointer_type (TREE_TYPE (DR_REF (dr))),
5155 /* Strip zero offset components since we don't need
5156 them and they can confuse late diagnostics if
5157 we CSE them wrongly. See PR106904 for example. */
5158 unshare_expr (strip_zero_offset_components
5159 (DR_REF (dr))));
5161 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
5162 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
5163 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
5164 gimple_seq_add_seq (new_stmt_list, seq);
5166 if (DR_PTR_INFO (dr)
5167 && TREE_CODE (addr_base) == SSA_NAME
5168 /* We should only duplicate pointer info to newly created SSA names. */
5169 && SSA_NAME_VAR (addr_base) == dest)
5171 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
5172 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
5175 if (dump_enabled_p ())
5176 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
5178 return addr_base;
5182 /* Function vect_create_data_ref_ptr.
5184 Create a new pointer-to-AGGR_TYPE variable (ap) that points to the first
5185 location accessed in the loop by STMT_INFO, along with the def-use update
5186 chain to appropriately advance the pointer through the loop iterations.
5187 Also set aliasing information for the pointer. This pointer is used by
5188 the callers to this function to create a memory reference expression for
5189 vector load/store access.
5191 Input:
5192 1. STMT_INFO: a stmt that references memory. Expected to be of the form
5193 GIMPLE_ASSIGN <name, data-ref> or
5194 GIMPLE_ASSIGN <data-ref, name>.
5195 2. AGGR_TYPE: the type of the reference, which should be either a vector
5196 or an array.
5197 3. AT_LOOP: the loop where the vector memref is to be created.
5198 4. OFFSET (optional): a byte offset to be added to the initial address
5199 accessed by the data-ref in STMT_INFO.
5200 5. BSI: location where the new stmts are to be placed if there is no loop
5201 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
5202 pointing to the initial address.
5203    7. IV_STEP (optional, defaults to NULL): the amount that should be added
5204 to the IV during each iteration of the loop. NULL says to move
5205 by one copy of AGGR_TYPE up or down, depending on the step of the
5206 data reference.
5208 Output:
5209 1. Declare a new ptr to vector_type, and have it point to the base of the
5210       data reference (initial address accessed by the data reference).
5211 For example, for vector of type V8HI, the following code is generated:
5213 v8hi *ap;
5214 ap = (v8hi *)initial_address;
5216 if OFFSET is not supplied:
5217 initial_address = &a[init];
5218 if OFFSET is supplied:
5219 initial_address = &a[init] + OFFSET;
5223 Return the initial_address in INITIAL_ADDRESS.
5225 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
5226 update the pointer in each iteration of the loop.
5228 Return the increment stmt that updates the pointer in PTR_INCR.
5230 3. Return the pointer. */
5232 tree
5233 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
5234 tree aggr_type, class loop *at_loop, tree offset,
5235 tree *initial_address, gimple_stmt_iterator *gsi,
5236 gimple **ptr_incr, bool only_init,
5237 tree iv_step)
5239 const char *base_name;
5240 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5241 class loop *loop = NULL;
5242 bool nested_in_vect_loop = false;
5243 class loop *containing_loop = NULL;
5244 tree aggr_ptr_type;
5245 tree aggr_ptr;
5246 tree new_temp;
5247 gimple_seq new_stmt_list = NULL;
5248 edge pe = NULL;
5249 basic_block new_bb;
5250 tree aggr_ptr_init;
5251 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5252 struct data_reference *dr = dr_info->dr;
5253 tree aptr;
5254 gimple_stmt_iterator incr_gsi;
5255 bool insert_after;
5256 tree indx_before_incr, indx_after_incr;
5257 gimple *incr;
5258 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5260 gcc_assert (iv_step != NULL_TREE
5261 || TREE_CODE (aggr_type) == ARRAY_TYPE
5262 || TREE_CODE (aggr_type) == VECTOR_TYPE);
5264 if (loop_vinfo)
5266 loop = LOOP_VINFO_LOOP (loop_vinfo);
5267 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5268 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5269 pe = loop_preheader_edge (loop);
5271 else
5273 gcc_assert (bb_vinfo);
5274 only_init = true;
5275 *ptr_incr = NULL;
5278 /* Create an expression for the first address accessed by this load
5279 in LOOP. */
5280 base_name = get_name (DR_BASE_ADDRESS (dr));
5282 if (dump_enabled_p ())
5284 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
5285 dump_printf_loc (MSG_NOTE, vect_location,
5286 "create %s-pointer variable to type: %T",
5287 get_tree_code_name (TREE_CODE (aggr_type)),
5288 aggr_type);
5289 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
5290 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
5291 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
5292 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
5293 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5294 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
5295 else
5296 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
5297 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5300   /* (1) Create the new aggregate-pointer variable.
5301      Vector and array types inherit the alias set of their component
5302      type by default, so we need to use a ref-all pointer if the data
5303      reference does not conflict with the created aggregate data
5304      reference because it is not addressable.  */
5305 bool need_ref_all = false;
5306 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5307 get_alias_set (DR_REF (dr))))
5308 need_ref_all = true;
5309 /* Likewise for any of the data references in the stmt group. */
5310 else if (DR_GROUP_SIZE (stmt_info) > 1)
5312 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5315 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5316 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5317 get_alias_set (DR_REF (sdr))))
5319 need_ref_all = true;
5320 break;
5322 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5324 while (sinfo);
5326 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5327 need_ref_all);
5328 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5331 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5332 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5333 def-use update cycles for the pointer: one relative to the outer-loop
5334 (LOOP), which is what steps (3) and (4) below do. The other is relative
5335 to the inner-loop (which is the inner-most loop containing the dataref),
5336      and this is done by step (5) below.
5338 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5339 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5340 redundant. Steps (3),(4) create the following:
5342 vp0 = &base_addr;
5343 LOOP: vp1 = phi(vp0,vp2)
5346 vp2 = vp1 + step
5347 goto LOOP
5349 If there is an inner-loop nested in loop, then step (5) will also be
5350 applied, and an additional update in the inner-loop will be created:
5352 vp0 = &base_addr;
5353 LOOP: vp1 = phi(vp0,vp2)
5355 inner: vp3 = phi(vp1,vp4)
5356 vp4 = vp3 + inner_step
5357 if () goto inner
5359 vp2 = vp1 + step
5360 if () goto LOOP */
5362 /* (2) Calculate the initial address of the aggregate-pointer, and set
5363 the aggregate-pointer to point to it before the loop. */
5365   /* Create &(base[init_val] + offset) in the loop preheader.  */
5367 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5368 stmt_info, &new_stmt_list,
5369 offset);
5370 if (new_stmt_list)
5372 if (pe)
5374 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5375 gcc_assert (!new_bb);
5377 else
5378 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5381 *initial_address = new_temp;
5382 aggr_ptr_init = new_temp;
5384 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5385 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5386 inner-loop nested in LOOP (during outer-loop vectorization). */
5388 /* No update in loop is required. */
5389 if (only_init && (!loop_vinfo || at_loop == loop))
5390 aptr = aggr_ptr_init;
5391 else
5393 /* Accesses to invariant addresses should be handled specially
5394 by the caller. */
5395 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5396 gcc_assert (!integer_zerop (step));
5398 if (iv_step == NULL_TREE)
5400 /* The step of the aggregate pointer is the type size,
5401 negated for downward accesses. */
5402 iv_step = TYPE_SIZE_UNIT (aggr_type);
5403 if (tree_int_cst_sgn (step) == -1)
5404 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5407 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5409 create_iv (aggr_ptr_init, PLUS_EXPR,
5410 fold_convert (aggr_ptr_type, iv_step),
5411 aggr_ptr, loop, &incr_gsi, insert_after,
5412 &indx_before_incr, &indx_after_incr);
5413 incr = gsi_stmt (incr_gsi);
5415 /* Copy the points-to information if it exists. */
5416 if (DR_PTR_INFO (dr))
5418 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5419 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5421 if (ptr_incr)
5422 *ptr_incr = incr;
5424 aptr = indx_before_incr;
5427 if (!nested_in_vect_loop || only_init)
5428 return aptr;
5431 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5432      nested in LOOP, if it exists.  */
5434 gcc_assert (nested_in_vect_loop);
5435 if (!only_init)
5437 standard_iv_increment_position (containing_loop, &incr_gsi,
5438 &insert_after);
5439 create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5440 aggr_ptr, containing_loop, &incr_gsi, insert_after,
5441 &indx_before_incr, &indx_after_incr);
5442 incr = gsi_stmt (incr_gsi);
5444 /* Copy the points-to information if it exists. */
5445 if (DR_PTR_INFO (dr))
5447 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5448 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5450 if (ptr_incr)
5451 *ptr_incr = incr;
5453 return indx_before_incr;
5455 else
5456 gcc_unreachable ();
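/* A minimal sketch of what the vp0/vp1/vp2 update cycle above corresponds to
   in scalar terms, assuming 4-element int vectors and a unit-stride array;
   this is an illustration, not the emitted GIMPLE:

     #include <stdio.h>

     int main (void)
     {
       int a[16];
       for (int i = 0; i < 16; i++)
         a[i] = i;
       int *vp = &a[0];                  // vp0 = &base_addr
       int sum = 0;
       for (int i = 0; i < 16; i += 4)   // one vector per iteration
         {
           for (int k = 0; k < 4; k++)   // stands in for one vector load
             sum += vp[k];
           vp += 4;                      // vp2 = vp1 + step
         }
       printf ("%d\n", sum);             // prints 120
       return 0;
     }
*/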
5460 /* Function bump_vector_ptr.
5462 Increment a pointer (to a vector type) by vector-size. If requested,
5463 i.e. if PTR-INCR is given, then also connect the new increment stmt
5464 to the existing def-use update-chain of the pointer, by modifying
5465 the PTR_INCR as illustrated below:
5467 The pointer def-use update-chain before this function:
5468 DATAREF_PTR = phi (p_0, p_2)
5469 ....
5470 PTR_INCR: p_2 = DATAREF_PTR + step
5472 The pointer def-use update-chain after this function:
5473 DATAREF_PTR = phi (p_0, p_2)
5474 ....
5475 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5476 ....
5477 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5479 Input:
5480 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5481 in the loop.
5482 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5483 the loop. The increment amount across iterations is expected
5484 to be vector_size.
5485 BSI - location where the new update stmt is to be placed.
5486 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5487 BUMP - optional. The offset by which to bump the pointer. If not given,
5488 the offset is assumed to be vector_size.
5490 Output: Return NEW_DATAREF_PTR as illustrated above.
5494 tree
5495 bump_vector_ptr (vec_info *vinfo,
5496 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5497 stmt_vec_info stmt_info, tree bump)
5499 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5500 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5501 tree update = TYPE_SIZE_UNIT (vectype);
5502 gimple *incr_stmt;
5503 ssa_op_iter iter;
5504 use_operand_p use_p;
5505 tree new_dataref_ptr;
5507 if (bump)
5508 update = bump;
5510 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5511 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5512 else if (is_gimple_min_invariant (dataref_ptr))
5513     /* When possible avoid emitting a separate increment stmt that would
5514        force the addressed object to become addressable.  */
5515 return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5516 fold_build2 (MEM_REF,
5517 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5518 dataref_ptr,
5519 fold_convert (ptr_type_node, update)));
5520 else
5521 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5522 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5523 dataref_ptr, update);
5524 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5525   /* Fold the increment, avoiding excessive use-def chains that would
5526      lead to compile-time issues for passes until the next forwprop
5527      pass, which would do this as well.  */
5528 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5529 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5531 incr_stmt = gsi_stmt (fold_gsi);
5532 update_stmt (incr_stmt);
5535 /* Copy the points-to information if it exists. */
5536 if (DR_PTR_INFO (dr))
5538 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5539 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5542 if (!ptr_incr)
5543 return new_dataref_ptr;
5545 /* Update the vector-pointer's cross-iteration increment. */
5546 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5548 tree use = USE_FROM_PTR (use_p);
5550 if (use == dataref_ptr)
5551 SET_USE (use_p, new_dataref_ptr);
5552 else
5553 gcc_assert (operand_equal_p (use, update, 0));
5556 return new_dataref_ptr;
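/* Illustrative sketch only (the vector size of 16 bytes is an assumption):
   in scalar terms the bump adds BUMP bytes to the running pointer, and the
   pre-existing increment then continues from the new pointer:

     #include <stdio.h>

     int main (void)
     {
       short buf[32];
       const long bump = 16;                        // assumed vector size in bytes
       char *dataref_ptr = (char *) buf;
       char *new_dataref_ptr = dataref_ptr + bump;  // NEW_DATAREF_PTR = DATAREF_PTR + BUMP
       char *p_2 = new_dataref_ptr + bump;          // PTR_INCR now steps from the new pointer
       printf ("%td %td\n", new_dataref_ptr - (char *) buf,
               p_2 - (char *) buf);                 // prints "16 32"
       return 0;
     }
*/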
5560 /* Copy memory reference info such as base/clique from the SRC reference
5561 to the DEST MEM_REF. */
5563 void
5564 vect_copy_ref_info (tree dest, tree src)
5566 if (TREE_CODE (dest) != MEM_REF)
5567 return;
5569 tree src_base = src;
5570 while (handled_component_p (src_base))
5571 src_base = TREE_OPERAND (src_base, 0);
5572 if (TREE_CODE (src_base) != MEM_REF
5573 && TREE_CODE (src_base) != TARGET_MEM_REF)
5574 return;
5576 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5577 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5581 /* Function vect_create_destination_var.
5583 Create a new temporary of type VECTYPE. */
5585 tree
5586 vect_create_destination_var (tree scalar_dest, tree vectype)
5588 tree vec_dest;
5589 const char *name;
5590 char *new_name;
5591 tree type;
5592 enum vect_var_kind kind;
5594 kind = vectype
5595 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5596 ? vect_mask_var
5597 : vect_simple_var
5598 : vect_scalar_var;
5599 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5601 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5603 name = get_name (scalar_dest);
5604 if (name)
5605 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5606 else
5607 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5608 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5609 free (new_name);
5611 return vec_dest;
5614 /* Function vect_grouped_store_supported.
5616 Returns TRUE if interleave high and interleave low permutations
5617 are supported, and FALSE otherwise. */
5619 bool
5620 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5622 machine_mode mode = TYPE_MODE (vectype);
5624 /* vect_permute_store_chain requires the group size to be equal to 3 or
5625 be a power of two. */
5626 if (count != 3 && exact_log2 (count) == -1)
5628 if (dump_enabled_p ())
5629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5630 "the size of the group of accesses"
5631 " is not a power of 2 or not eqaul to 3\n");
5632 return false;
5635 /* Check that the permutation is supported. */
5636 if (VECTOR_MODE_P (mode))
5638 unsigned int i;
5639 if (count == 3)
5641 unsigned int j0 = 0, j1 = 0, j2 = 0;
5642 unsigned int i, j;
5644 unsigned int nelt;
5645 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5647 if (dump_enabled_p ())
5648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5649 "cannot handle groups of 3 stores for"
5650 " variable-length vectors\n");
5651 return false;
5654 vec_perm_builder sel (nelt, nelt, 1);
5655 sel.quick_grow (nelt);
5656 vec_perm_indices indices;
5657 for (j = 0; j < 3; j++)
5659 int nelt0 = ((3 - j) * nelt) % 3;
5660 int nelt1 = ((3 - j) * nelt + 1) % 3;
5661 int nelt2 = ((3 - j) * nelt + 2) % 3;
5662 for (i = 0; i < nelt; i++)
5664 if (3 * i + nelt0 < nelt)
5665 sel[3 * i + nelt0] = j0++;
5666 if (3 * i + nelt1 < nelt)
5667 sel[3 * i + nelt1] = nelt + j1++;
5668 if (3 * i + nelt2 < nelt)
5669 sel[3 * i + nelt2] = 0;
5671 indices.new_vector (sel, 2, nelt);
5672 if (!can_vec_perm_const_p (mode, mode, indices))
5674 if (dump_enabled_p ())
5675 dump_printf (MSG_MISSED_OPTIMIZATION,
5676 "permutation op not supported by target.\n");
5677 return false;
5680 for (i = 0; i < nelt; i++)
5682 if (3 * i + nelt0 < nelt)
5683 sel[3 * i + nelt0] = 3 * i + nelt0;
5684 if (3 * i + nelt1 < nelt)
5685 sel[3 * i + nelt1] = 3 * i + nelt1;
5686 if (3 * i + nelt2 < nelt)
5687 sel[3 * i + nelt2] = nelt + j2++;
5689 indices.new_vector (sel, 2, nelt);
5690 if (!can_vec_perm_const_p (mode, mode, indices))
5692 if (dump_enabled_p ())
5693 dump_printf (MSG_MISSED_OPTIMIZATION,
5694 "permutation op not supported by target.\n");
5695 return false;
5698 return true;
5700 else
5702       /* If length is not equal to 3 then only a power of 2 is supported.  */
5703 gcc_assert (pow2p_hwi (count));
5704 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5706 /* The encoding has 2 interleaved stepped patterns. */
5707       if (!multiple_p (nelt, 2))
5708 return false;
5709 vec_perm_builder sel (nelt, 2, 3);
5710 sel.quick_grow (6);
5711 for (i = 0; i < 3; i++)
5713 sel[i * 2] = i;
5714 sel[i * 2 + 1] = i + nelt;
5716 vec_perm_indices indices (sel, 2, nelt);
5717 if (can_vec_perm_const_p (mode, mode, indices))
5719 for (i = 0; i < 6; i++)
5720 sel[i] += exact_div (nelt, 2);
5721 indices.new_vector (sel, 2, nelt);
5722 if (can_vec_perm_const_p (mode, mode, indices))
5723 return true;
5728 if (dump_enabled_p ())
5729 dump_printf (MSG_MISSED_OPTIMIZATION,
5730 "permutation op not supported by target.\n");
5731 return false;
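/* A standalone sketch of the interleave masks checked by the power-of-two
   branch above, assuming 8 elements per vector; indices refer to the
   2*nelt-element concatenation of the two permute inputs:

     #include <stdio.h>

     int main (void)
     {
       const unsigned nelt = 8;          // assumed number of vector elements
       printf ("interleave_high mask:");
       for (unsigned i = 0; i < nelt / 2; i++)
         printf (" %u %u", i, nelt + i);                       // 0 8 1 9 2 10 3 11
       printf ("\ninterleave_low mask:");
       for (unsigned i = 0; i < nelt / 2; i++)
         printf (" %u %u", nelt / 2 + i, nelt + nelt / 2 + i); // 4 12 5 13 6 14 7 15
       printf ("\n");
       return 0;
     }
*/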
5734 /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
5735 of type VECTYPE. MASKED_P says whether the masked form is needed. */
5737 internal_fn
5738 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5739 bool masked_p)
5741 if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5742 vec_mask_len_store_lanes_optab, vectype,
5743 count))
5744 return IFN_MASK_LEN_STORE_LANES;
5745 else if (masked_p)
5747 if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5748 vec_mask_store_lanes_optab, vectype,
5749 count))
5750 return IFN_MASK_STORE_LANES;
5752 else
5754 if (vect_lanes_optab_supported_p ("vec_store_lanes",
5755 vec_store_lanes_optab, vectype, count))
5756 return IFN_STORE_LANES;
5758 return IFN_LAST;
5762 /* Function vect_permute_store_chain.
5764 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5765 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5766 the data correctly for the stores. Return the final references for stores
5767 in RESULT_CHAIN.
5769 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5770 The input is 4 vectors each containing 8 elements. We assign a number to
5771 each element, the input sequence is:
5773 1st vec: 0 1 2 3 4 5 6 7
5774 2nd vec: 8 9 10 11 12 13 14 15
5775 3rd vec: 16 17 18 19 20 21 22 23
5776 4th vec: 24 25 26 27 28 29 30 31
5778 The output sequence should be:
5780 1st vec: 0 8 16 24 1 9 17 25
5781 2nd vec: 2 10 18 26 3 11 19 27
5782    3rd vec:   4 12 20 28  5 13 21 29
5783 4th vec: 6 14 22 30 7 15 23 31
5785 i.e., we interleave the contents of the four vectors in their order.
5787 We use interleave_high/low instructions to create such output. The input of
5788 each interleave_high/low operation is two vectors:
5789 1st vec 2nd vec
5790 0 1 2 3 4 5 6 7
5791 the even elements of the result vector are obtained left-to-right from the
5792 high/low elements of the first vector. The odd elements of the result are
5793 obtained left-to-right from the high/low elements of the second vector.
5794 The output of interleave_high will be: 0 4 1 5
5795 and of interleave_low: 2 6 3 7
5798 The permutation is done in log LENGTH stages. In each stage interleave_high
5799 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5800 where the first argument is taken from the first half of DR_CHAIN and the
5801    second argument from its second half.
5802 In our example,
5804 I1: interleave_high (1st vec, 3rd vec)
5805 I2: interleave_low (1st vec, 3rd vec)
5806 I3: interleave_high (2nd vec, 4th vec)
5807 I4: interleave_low (2nd vec, 4th vec)
5809 The output for the first stage is:
5811 I1: 0 16 1 17 2 18 3 19
5812 I2: 4 20 5 21 6 22 7 23
5813 I3: 8 24 9 25 10 26 11 27
5814 I4: 12 28 13 29 14 30 15 31
5816 The output of the second stage, i.e. the final result is:
5818 I1: 0 8 16 24 1 9 17 25
5819 I2: 2 10 18 26 3 11 19 27
5820    I3:  4 12 20 28  5 13 21 29
5821 I4: 6 14 22 30 7 15 23 31. */
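/* The example above can be checked with a scalar model of the full
   interleave; this standalone sketch reproduces the documented output
   sequence (it is not the vectorized code itself):

     #include <stdio.h>

     int main (void)
     {
       int in[4][8], out[4][8];
       for (int v = 0; v < 4; v++)          // 4 input vectors of 8 elements
         for (int e = 0; e < 8; e++)
           in[v][e] = v * 8 + e;
       // Full interleave: element e of input vector v moves to position
       // e * 4 + v of the concatenated output.
       for (int v = 0; v < 4; v++)
         for (int e = 0; e < 8; e++)
           {
             int pos = e * 4 + v;
             out[pos / 8][pos % 8] = in[v][e];
           }
       for (int v = 0; v < 4; v++)          // prints the I1..I4 rows above
         {
           for (int e = 0; e < 8; e++)
             printf (" %d", out[v][e]);
           printf ("\n");
         }
       return 0;
     }
*/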
5823 void
5824 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5825 unsigned int length,
5826 stmt_vec_info stmt_info,
5827 gimple_stmt_iterator *gsi,
5828 vec<tree> *result_chain)
5830 tree vect1, vect2, high, low;
5831 gimple *perm_stmt;
5832 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5833 tree perm_mask_low, perm_mask_high;
5834 tree data_ref;
5835 tree perm3_mask_low, perm3_mask_high;
5836 unsigned int i, j, n, log_length = exact_log2 (length);
5838 result_chain->quick_grow (length);
5839 memcpy (result_chain->address (), dr_chain.address (),
5840 length * sizeof (tree));
5842 if (length == 3)
5844 /* vect_grouped_store_supported ensures that this is constant. */
5845 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5846 unsigned int j0 = 0, j1 = 0, j2 = 0;
5848 vec_perm_builder sel (nelt, nelt, 1);
5849 sel.quick_grow (nelt);
5850 vec_perm_indices indices;
5851 for (j = 0; j < 3; j++)
5853 int nelt0 = ((3 - j) * nelt) % 3;
5854 int nelt1 = ((3 - j) * nelt + 1) % 3;
5855 int nelt2 = ((3 - j) * nelt + 2) % 3;
5857 for (i = 0; i < nelt; i++)
5859 if (3 * i + nelt0 < nelt)
5860 sel[3 * i + nelt0] = j0++;
5861 if (3 * i + nelt1 < nelt)
5862 sel[3 * i + nelt1] = nelt + j1++;
5863 if (3 * i + nelt2 < nelt)
5864 sel[3 * i + nelt2] = 0;
5866 indices.new_vector (sel, 2, nelt);
5867 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5869 for (i = 0; i < nelt; i++)
5871 if (3 * i + nelt0 < nelt)
5872 sel[3 * i + nelt0] = 3 * i + nelt0;
5873 if (3 * i + nelt1 < nelt)
5874 sel[3 * i + nelt1] = 3 * i + nelt1;
5875 if (3 * i + nelt2 < nelt)
5876 sel[3 * i + nelt2] = nelt + j2++;
5878 indices.new_vector (sel, 2, nelt);
5879 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5881 vect1 = dr_chain[0];
5882 vect2 = dr_chain[1];
5884 /* Create interleaving stmt:
5885 low = VEC_PERM_EXPR <vect1, vect2,
5886 {j, nelt, *, j + 1, nelt + j + 1, *,
5887 j + 2, nelt + j + 2, *, ...}> */
5888 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5889 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5890 vect2, perm3_mask_low);
5891 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5893 vect1 = data_ref;
5894 vect2 = dr_chain[2];
5895 /* Create interleaving stmt:
5896          high = VEC_PERM_EXPR <vect1, vect2,
5897 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5898 6, 7, nelt + j + 2, ...}> */
5899 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5900 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5901 vect2, perm3_mask_high);
5902 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5903 (*result_chain)[j] = data_ref;
5906 else
5908       /* If length is not equal to 3 then only a power of 2 is supported.  */
5909 gcc_assert (pow2p_hwi (length));
5911 /* The encoding has 2 interleaved stepped patterns. */
5912 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5913 vec_perm_builder sel (nelt, 2, 3);
5914 sel.quick_grow (6);
5915 for (i = 0; i < 3; i++)
5917 sel[i * 2] = i;
5918 sel[i * 2 + 1] = i + nelt;
5920 vec_perm_indices indices (sel, 2, nelt);
5921 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5923 for (i = 0; i < 6; i++)
5924 sel[i] += exact_div (nelt, 2);
5925 indices.new_vector (sel, 2, nelt);
5926 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5928 for (i = 0, n = log_length; i < n; i++)
5930 for (j = 0; j < length/2; j++)
5932 vect1 = dr_chain[j];
5933 vect2 = dr_chain[j+length/2];
5935 /* Create interleaving stmt:
5936 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5937 ...}> */
5938 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5939 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5940 vect2, perm_mask_high);
5941 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5942 (*result_chain)[2*j] = high;
5944 /* Create interleaving stmt:
5945 low = VEC_PERM_EXPR <vect1, vect2,
5946 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5947 ...}> */
5948 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5949 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5950 vect2, perm_mask_low);
5951 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5952 (*result_chain)[2*j+1] = low;
5954 memcpy (dr_chain.address (), result_chain->address (),
5955 length * sizeof (tree));
5960 /* Function vect_setup_realignment.
5962 This function is called when vectorizing an unaligned load using
5963 the dr_explicit_realign[_optimized] scheme.
5964 This function generates the following code at the loop prolog:
5966 p = initial_addr;
5967 x msq_init = *(floor(p)); # prolog load
5968 realignment_token = call target_builtin;
5969 loop:
5970 x msq = phi (msq_init, ---)
5972 The stmts marked with x are generated only for the case of
5973 dr_explicit_realign_optimized.
5975 The code above sets up a new (vector) pointer, pointing to the first
5976 location accessed by STMT_INFO, and a "floor-aligned" load using that
5977 pointer. It also generates code to compute the "realignment-token"
5978 (if the relevant target hook was defined), and creates a phi-node at the
5979 loop-header bb whose arguments are the result of the prolog-load (created
5980 by this function) and the result of a load that takes place in the loop
5981 (to be created by the caller to this function).
5983 For the case of dr_explicit_realign_optimized:
5984 The caller to this function uses the phi-result (msq) to create the
5985 realignment code inside the loop, and sets up the missing phi argument,
5986 as follows:
5987 loop:
5988 msq = phi (msq_init, lsq)
5989 lsq = *(floor(p')); # load in loop
5990 result = realign_load (msq, lsq, realignment_token);
5992 For the case of dr_explicit_realign:
5993 loop:
5994 msq = *(floor(p)); # load in loop
5995 p' = p + (VS-1);
5996 lsq = *(floor(p')); # load in loop
5997 result = realign_load (msq, lsq, realignment_token);
5999 Input:
6000 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
6001 a memory location that may be unaligned.
6002 BSI - place where new code is to be inserted.
6003 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
6004 is used.
6006 Output:
6007 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
6008 target hook, if defined.
6009 Return value - the result of the loop-header phi node. */
6011 tree
6012 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
6013 gimple_stmt_iterator *gsi, tree *realignment_token,
6014 enum dr_alignment_support alignment_support_scheme,
6015 tree init_addr,
6016 class loop **at_loop)
6018 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6019 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6020 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
6021 struct data_reference *dr = dr_info->dr;
6022 class loop *loop = NULL;
6023 edge pe = NULL;
6024 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
6025 tree vec_dest;
6026 gimple *inc;
6027 tree ptr;
6028 tree data_ref;
6029 basic_block new_bb;
6030 tree msq_init = NULL_TREE;
6031 tree new_temp;
6032 gphi *phi_stmt;
6033 tree msq = NULL_TREE;
6034 gimple_seq stmts = NULL;
6035 bool compute_in_loop = false;
6036 bool nested_in_vect_loop = false;
6037 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
6038 class loop *loop_for_initial_load = NULL;
6040 if (loop_vinfo)
6042 loop = LOOP_VINFO_LOOP (loop_vinfo);
6043 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
6046 gcc_assert (alignment_support_scheme == dr_explicit_realign
6047 || alignment_support_scheme == dr_explicit_realign_optimized);
6049 /* We need to generate three things:
6050 1. the misalignment computation
6051 2. the extra vector load (for the optimized realignment scheme).
6052 3. the phi node for the two vectors from which the realignment is
6053 done (for the optimized realignment scheme). */
6055 /* 1. Determine where to generate the misalignment computation.
6057 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
6058 calculation will be generated by this function, outside the loop (in the
6059 preheader). Otherwise, INIT_ADDR had already been computed for us by the
6060 caller, inside the loop.
6062 Background: If the misalignment remains fixed throughout the iterations of
6063 the loop, then both realignment schemes are applicable, and also the
6064 misalignment computation can be done outside LOOP. This is because we are
6065 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
6066 are a multiple of VS (the Vector Size), and therefore the misalignment in
6067 different vectorized LOOP iterations is always the same.
6068 The problem arises only if the memory access is in an inner-loop nested
6069 inside LOOP, which is now being vectorized using outer-loop vectorization.
6070 This is the only case when the misalignment of the memory access may not
6071 remain fixed throughout the iterations of the inner-loop (as explained in
6072 detail in vect_supportable_dr_alignment). In this case, not only is the
6073 optimized realignment scheme not applicable, but also the misalignment
6074 computation (and generation of the realignment token that is passed to
6075    REALIGN_LOAD) has to be done inside the loop.
6077 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
6078 or not, which in turn determines if the misalignment is computed inside
6079 the inner-loop, or outside LOOP. */
6081 if (init_addr != NULL_TREE || !loop_vinfo)
6083 compute_in_loop = true;
6084 gcc_assert (alignment_support_scheme == dr_explicit_realign);
6088 /* 2. Determine where to generate the extra vector load.
6090 For the optimized realignment scheme, instead of generating two vector
6091 loads in each iteration, we generate a single extra vector load in the
6092 preheader of the loop, and in each iteration reuse the result of the
6093 vector load from the previous iteration. In case the memory access is in
6094 an inner-loop nested inside LOOP, which is now being vectorized using
6095 outer-loop vectorization, we need to determine whether this initial vector
6096 load should be generated at the preheader of the inner-loop, or can be
6097 generated at the preheader of LOOP. If the memory access has no evolution
6098 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
6099 to be generated inside LOOP (in the preheader of the inner-loop). */
6101 if (nested_in_vect_loop)
6103 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
6104 bool invariant_in_outerloop =
6105 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
6106 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
6108 else
6109 loop_for_initial_load = loop;
6110 if (at_loop)
6111 *at_loop = loop_for_initial_load;
6113 tree vuse = NULL_TREE;
6114 if (loop_for_initial_load)
6116 pe = loop_preheader_edge (loop_for_initial_load);
6117 if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
6118 vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
6120 if (!vuse)
6121 vuse = gimple_vuse (gsi_stmt (*gsi));
6123 /* 3. For the case of the optimized realignment, create the first vector
6124 load at the loop preheader. */
6126 if (alignment_support_scheme == dr_explicit_realign_optimized)
6128 /* Create msq_init = *(floor(p1)) in the loop preheader */
6129 gassign *new_stmt;
6131 gcc_assert (!compute_in_loop);
6132 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6133 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
6134 loop_for_initial_load, NULL_TREE,
6135 &init_addr, NULL, &inc, true);
6136 if (TREE_CODE (ptr) == SSA_NAME)
6137 new_temp = copy_ssa_name (ptr);
6138 else
6139 new_temp = make_ssa_name (TREE_TYPE (ptr));
6140 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
6141 tree type = TREE_TYPE (ptr);
6142 new_stmt = gimple_build_assign
6143 (new_temp, BIT_AND_EXPR, ptr,
6144 fold_build2 (MINUS_EXPR, type,
6145 build_int_cst (type, 0),
6146 build_int_cst (type, align)));
6147 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6148 gcc_assert (!new_bb);
6149 data_ref
6150 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
6151 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
6152 vect_copy_ref_info (data_ref, DR_REF (dr));
6153 new_stmt = gimple_build_assign (vec_dest, data_ref);
6154 new_temp = make_ssa_name (vec_dest, new_stmt);
6155 gimple_assign_set_lhs (new_stmt, new_temp);
6156 gimple_set_vuse (new_stmt, vuse);
6157 if (pe)
6159 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6160 gcc_assert (!new_bb);
6162 else
6163 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6165 msq_init = gimple_assign_lhs (new_stmt);
6168 /* 4. Create realignment token using a target builtin, if available.
6169 It is done either inside the containing loop, or before LOOP (as
6170 determined above). */
6172 if (targetm.vectorize.builtin_mask_for_load)
6174 gcall *new_stmt;
6175 tree builtin_decl;
6177       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
6178 if (!init_addr)
6180 /* Generate the INIT_ADDR computation outside LOOP. */
6181 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
6182 stmt_info, &stmts,
6183 NULL_TREE);
6184 if (loop)
6186 pe = loop_preheader_edge (loop);
6187 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6188 gcc_assert (!new_bb);
6190 else
6191 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
6194 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
6195 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
6196 vec_dest =
6197 vect_create_destination_var (scalar_dest,
6198 gimple_call_return_type (new_stmt));
6199 new_temp = make_ssa_name (vec_dest, new_stmt);
6200 gimple_call_set_lhs (new_stmt, new_temp);
6202 if (compute_in_loop)
6203 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6204 else
6206 /* Generate the misalignment computation outside LOOP. */
6207 pe = loop_preheader_edge (loop);
6208 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6209 gcc_assert (!new_bb);
6212 *realignment_token = gimple_call_lhs (new_stmt);
6214 /* The result of the CALL_EXPR to this builtin is determined from
6215 the value of the parameter and no global variables are touched
6216 which makes the builtin a "const" function. Requiring the
6217 builtin to have the "const" attribute makes it unnecessary
6218 to call mark_call_clobbered. */
6219 gcc_assert (TREE_READONLY (builtin_decl));
6222 if (alignment_support_scheme == dr_explicit_realign)
6223 return msq;
6225 gcc_assert (!compute_in_loop);
6226 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
6229 /* 5. Create msq = phi <msq_init, lsq> in loop */
6231 pe = loop_preheader_edge (containing_loop);
6232 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6233 msq = make_ssa_name (vec_dest);
6234 phi_stmt = create_phi_node (msq, containing_loop->header);
6235 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
6237 return msq;
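/* A small sketch of the floor(p) computation and misalignment byte count
   that both realignment schemes rely on; the 16-byte vector alignment is an
   assumption and the target-specific realign_load combine itself is not
   modeled here:

     #include <stdint.h>
     #include <stdio.h>

     int main (void)
     {
       int a[32];
       int *p = &a[3];                             // possibly unaligned element
       const uintptr_t align = 16;                 // assumed vector alignment
       uintptr_t floor_p = (uintptr_t) p & -align; // floor (p): aligned load address
       unsigned misalign = (unsigned) ((uintptr_t) p & (align - 1));
       // floor_p % align is always 0; misalign depends on where `a' lands.
       printf ("floor(p) %% align = %lu, misalignment = %u bytes\n",
               (unsigned long) (floor_p % align), misalign);
       return 0;
     }
*/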
6241 /* Function vect_grouped_load_supported.
6243 COUNT is the size of the load group (the number of statements plus the
6244 number of gaps). SINGLE_ELEMENT_P is true if there is actually
6245 only one statement, with a gap of COUNT - 1.
6247 Returns true if a suitable permute exists. */
6249 bool
6250 vect_grouped_load_supported (tree vectype, bool single_element_p,
6251 unsigned HOST_WIDE_INT count)
6253 machine_mode mode = TYPE_MODE (vectype);
6255   /* If this is single-element interleaving with an element distance
6256      that leaves unused vector loads around, punt - we would at least
6257      create very sub-optimal code in that case (and blow up memory;
6258      see PR65518).  */
6259 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
6261 if (dump_enabled_p ())
6262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6263 "single-element interleaving not supported "
6264 "for not adjacent vector loads\n");
6265 return false;
6268 /* vect_permute_load_chain requires the group size to be equal to 3 or
6269 be a power of two. */
6270 if (count != 3 && exact_log2 (count) == -1)
6272 if (dump_enabled_p ())
6273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6274 "the size of the group of accesses"
6275 " is not a power of 2 or not equal to 3\n");
6276 return false;
6279 /* Check that the permutation is supported. */
6280 if (VECTOR_MODE_P (mode))
6282 unsigned int i, j;
6283 if (count == 3)
6285 unsigned int nelt;
6286 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
6288 if (dump_enabled_p ())
6289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6290 "cannot handle groups of 3 loads for"
6291 " variable-length vectors\n");
6292 return false;
6295 vec_perm_builder sel (nelt, nelt, 1);
6296 sel.quick_grow (nelt);
6297 vec_perm_indices indices;
6298 unsigned int k;
6299 for (k = 0; k < 3; k++)
6301 for (i = 0; i < nelt; i++)
6302 if (3 * i + k < 2 * nelt)
6303 sel[i] = 3 * i + k;
6304 else
6305 sel[i] = 0;
6306 indices.new_vector (sel, 2, nelt);
6307 if (!can_vec_perm_const_p (mode, mode, indices))
6309 if (dump_enabled_p ())
6310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6311 "shuffle of 3 loads is not supported by"
6312 " target\n");
6313 return false;
6315 for (i = 0, j = 0; i < nelt; i++)
6316 if (3 * i + k < 2 * nelt)
6317 sel[i] = i;
6318 else
6319 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6320 indices.new_vector (sel, 2, nelt);
6321 if (!can_vec_perm_const_p (mode, mode, indices))
6323 if (dump_enabled_p ())
6324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6325 "shuffle of 3 loads is not supported by"
6326 " target\n");
6327 return false;
6330 return true;
6332 else
6334       /* If length is not equal to 3 then only a power of 2 is supported.  */
6335 gcc_assert (pow2p_hwi (count));
6336 poly_uint64 nelt = GET_MODE_NUNITS (mode);
6338 /* The encoding has a single stepped pattern. */
6339 vec_perm_builder sel (nelt, 1, 3);
6340 sel.quick_grow (3);
6341 for (i = 0; i < 3; i++)
6342 sel[i] = i * 2;
6343 vec_perm_indices indices (sel, 2, nelt);
6344 if (can_vec_perm_const_p (mode, mode, indices))
6346 for (i = 0; i < 3; i++)
6347 sel[i] = i * 2 + 1;
6348 indices.new_vector (sel, 2, nelt);
6349 if (can_vec_perm_const_p (mode, mode, indices))
6350 return true;
6355 if (dump_enabled_p ())
6356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6357 "extract even/odd not supported by target\n");
6358 return false;
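/* Standalone sketch of the extract-even/odd masks tested by the power-of-two
   branch above, assuming 8 elements per vector; indices refer to the
   2*nelt-element concatenation of the two inputs:

     #include <stdio.h>

     int main (void)
     {
       const unsigned nelt = 8;   // assumed number of vector elements
       printf ("extract_even mask:");
       for (unsigned i = 0; i < nelt; i++)
         printf (" %u", 2 * i);         // 0 2 4 6 8 10 12 14
       printf ("\nextract_odd mask:");
       for (unsigned i = 0; i < nelt; i++)
         printf (" %u", 2 * i + 1);     // 1 3 5 7 9 11 13 15
       printf ("\n");
       return 0;
     }
*/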
6361 /* Return FN if vec_{mask_,mask_len_}load_lanes is available for COUNT vectors
6362 of type VECTYPE. MASKED_P says whether the masked form is needed. */
6364 internal_fn
6365 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6366 bool masked_p)
6368 if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6369 vec_mask_len_load_lanes_optab, vectype,
6370 count))
6371 return IFN_MASK_LEN_LOAD_LANES;
6372 else if (masked_p)
6374 if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6375 vec_mask_load_lanes_optab, vectype,
6376 count))
6377 return IFN_MASK_LOAD_LANES;
6379 else
6381 if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
6382 vectype, count))
6383 return IFN_LOAD_LANES;
6385 return IFN_LAST;
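/* A scalar model of the de-interleave that a load-lanes operation performs
   (in the spirit of AArch64 LD3); COUNT = 3 and 4 elements per vector are
   assumptions chosen for illustration:

     #include <stdio.h>

     int main (void)
     {
       int a[12], lanes[3][4];
       for (int i = 0; i < 12; i++)
         a[i] = i;
       for (int v = 0; v < 3; v++)        // vector v receives a[v], a[v+3], ...
         for (int e = 0; e < 4; e++)
           lanes[v][e] = a[e * 3 + v];
       for (int v = 0; v < 3; v++)        // prints 0 3 6 9 / 1 4 7 10 / 2 5 8 11
         {
           for (int e = 0; e < 4; e++)
             printf (" %d", lanes[v][e]);
           printf ("\n");
         }
       return 0;
     }
*/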
6388 /* Function vect_permute_load_chain.
6390 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6391 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6392 the input data correctly. Return the final references for loads in
6393 RESULT_CHAIN.
6395 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6396 The input is 4 vectors each containing 8 elements. We assign a number to each
6397 element, the input sequence is:
6399 1st vec: 0 1 2 3 4 5 6 7
6400 2nd vec: 8 9 10 11 12 13 14 15
6401 3rd vec: 16 17 18 19 20 21 22 23
6402 4th vec: 24 25 26 27 28 29 30 31
6404 The output sequence should be:
6406 1st vec: 0 4 8 12 16 20 24 28
6407 2nd vec: 1 5 9 13 17 21 25 29
6408 3rd vec: 2 6 10 14 18 22 26 30
6409 4th vec: 3 7 11 15 19 23 27 31
6411 i.e., the first output vector should contain the first elements of each
6412 interleaving group, etc.
6414 We use extract_even/odd instructions to create such output. The input of
6415 each extract_even/odd operation is two vectors
6416 1st vec 2nd vec
6417 0 1 2 3 4 5 6 7
6419 and the output is the vector of extracted even/odd elements. The output of
6420 extract_even will be: 0 2 4 6
6421 and of extract_odd: 1 3 5 7
6424 The permutation is done in log LENGTH stages. In each stage extract_even
6425 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6426 their order. In our example,
6428 E1: extract_even (1st vec, 2nd vec)
6429 E2: extract_odd (1st vec, 2nd vec)
6430 E3: extract_even (3rd vec, 4th vec)
6431 E4: extract_odd (3rd vec, 4th vec)
6433 The output for the first stage will be:
6435 E1: 0 2 4 6 8 10 12 14
6436 E2: 1 3 5 7 9 11 13 15
6437 E3: 16 18 20 22 24 26 28 30
6438 E4: 17 19 21 23 25 27 29 31
6440 In order to proceed and create the correct sequence for the next stage (or
6441 for the correct output, if the second stage is the last one, as in our
6442 example), we first put the output of extract_even operation and then the
6443 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6444 The input for the second stage is:
6446 1st vec (E1): 0 2 4 6 8 10 12 14
6447 2nd vec (E3): 16 18 20 22 24 26 28 30
6448 3rd vec (E2): 1 3 5 7 9 11 13 15
6449 4th vec (E4): 17 19 21 23 25 27 29 31
6451 The output of the second stage:
6453 E1: 0 4 8 12 16 20 24 28
6454 E2: 2 6 10 14 18 22 26 30
6455 E3: 1 5 9 13 17 21 25 29
6456 E4: 3 7 11 15 19 23 27 31
6458 And RESULT_CHAIN after reordering:
6460 1st vec (E1): 0 4 8 12 16 20 24 28
6461 2nd vec (E3): 1 5 9 13 17 21 25 29
6462 3rd vec (E2): 2 6 10 14 18 22 26 30
6463 4th vec (E4): 3 7 11 15 19 23 27 31. */
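/* The two-stage example above can be replayed with a scalar model; this
   standalone sketch reproduces the documented RESULT_CHAIN (it is only an
   illustration of the data movement, not the emitted permutes):

     #include <stdio.h>
     #include <string.h>

     int main (void)
     {
       int chain[4][8], next[4][8];
       for (int v = 0; v < 4; v++)
         for (int e = 0; e < 8; e++)
           chain[v][e] = v * 8 + e;
       for (int stage = 0; stage < 2; stage++)   // log2 (LENGTH) stages
         {
           for (int j = 0; j < 4; j += 2)
             for (int e = 0; e < 8; e++)
               {
                 int even = 2 * e, odd = 2 * e + 1;   // indices into the concat
                 next[j / 2][e] = even < 8 ? chain[j][even]
                                           : chain[j + 1][even - 8];
                 next[j / 2 + 2][e] = odd < 8 ? chain[j][odd]
                                              : chain[j + 1][odd - 8];
               }
           memcpy (chain, next, sizeof (chain));
         }
       for (int v = 0; v < 4; v++)   // prints 0 4 8 ... / 1 5 9 ... / 2 6 10 ... / 3 7 11 ...
         {
           for (int e = 0; e < 8; e++)
             printf (" %d", chain[v][e]);
           printf ("\n");
         }
       return 0;
     }
*/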
6465 static void
6466 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6467 unsigned int length,
6468 stmt_vec_info stmt_info,
6469 gimple_stmt_iterator *gsi,
6470 vec<tree> *result_chain)
6472 tree data_ref, first_vect, second_vect;
6473 tree perm_mask_even, perm_mask_odd;
6474 tree perm3_mask_low, perm3_mask_high;
6475 gimple *perm_stmt;
6476 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6477 unsigned int i, j, log_length = exact_log2 (length);
6479 result_chain->quick_grow (length);
6480 memcpy (result_chain->address (), dr_chain.address (),
6481 length * sizeof (tree));
6483 if (length == 3)
6485 /* vect_grouped_load_supported ensures that this is constant. */
6486 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6487 unsigned int k;
6489 vec_perm_builder sel (nelt, nelt, 1);
6490 sel.quick_grow (nelt);
6491 vec_perm_indices indices;
6492 for (k = 0; k < 3; k++)
6494 for (i = 0; i < nelt; i++)
6495 if (3 * i + k < 2 * nelt)
6496 sel[i] = 3 * i + k;
6497 else
6498 sel[i] = 0;
6499 indices.new_vector (sel, 2, nelt);
6500 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6502 for (i = 0, j = 0; i < nelt; i++)
6503 if (3 * i + k < 2 * nelt)
6504 sel[i] = i;
6505 else
6506 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6507 indices.new_vector (sel, 2, nelt);
6508 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6510 first_vect = dr_chain[0];
6511 second_vect = dr_chain[1];
6513 /* Create interleaving stmt (low part of):
6514          low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6515 ...}> */
6516 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6517 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6518 second_vect, perm3_mask_low);
6519 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6521 /* Create interleaving stmt (high part of):
6522          high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6523 ...}> */
6524 first_vect = data_ref;
6525 second_vect = dr_chain[2];
6526 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6527 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6528 second_vect, perm3_mask_high);
6529 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6530 (*result_chain)[k] = data_ref;
6533 else
6535       /* If length is not equal to 3 then only a power of 2 is supported.  */
6536 gcc_assert (pow2p_hwi (length));
6538 /* The encoding has a single stepped pattern. */
6539 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6540 vec_perm_builder sel (nelt, 1, 3);
6541 sel.quick_grow (3);
6542 for (i = 0; i < 3; ++i)
6543 sel[i] = i * 2;
6544 vec_perm_indices indices (sel, 2, nelt);
6545 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6547 for (i = 0; i < 3; ++i)
6548 sel[i] = i * 2 + 1;
6549 indices.new_vector (sel, 2, nelt);
6550 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6552 for (i = 0; i < log_length; i++)
6554 for (j = 0; j < length; j += 2)
6556 first_vect = dr_chain[j];
6557 second_vect = dr_chain[j+1];
6559 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6560 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6561 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6562 first_vect, second_vect,
6563 perm_mask_even);
6564 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6565 (*result_chain)[j/2] = data_ref;
6567 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6568 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6569 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6570 first_vect, second_vect,
6571 perm_mask_odd);
6572 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6573 (*result_chain)[j/2+length/2] = data_ref;
6575 memcpy (dr_chain.address (), result_chain->address (),
6576 length * sizeof (tree));
6581 /* Function vect_shift_permute_load_chain.
6583 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6584 sequence of stmts to reorder the input data accordingly.
6585 Return the final references for loads in RESULT_CHAIN.
6586    Return true if successful, false otherwise.
6588 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6589 The input is 3 vectors each containing 8 elements. We assign a
6590 number to each element, the input sequence is:
6592 1st vec: 0 1 2 3 4 5 6 7
6593 2nd vec: 8 9 10 11 12 13 14 15
6594 3rd vec: 16 17 18 19 20 21 22 23
6596 The output sequence should be:
6598 1st vec: 0 3 6 9 12 15 18 21
6599 2nd vec: 1 4 7 10 13 16 19 22
6600 3rd vec: 2 5 8 11 14 17 20 23
6602 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6604 First we shuffle all 3 vectors to get correct elements order:
6606 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6607 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6608 3rd vec: (16 19 22) (17 20 23) (18 21)
6610 Next we unite and shift vector 3 times:
6612 1st step:
6613 shift right by 6 the concatenation of:
6614 "1st vec" and "2nd vec"
6615 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6616 "2nd vec" and "3rd vec"
6617 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6618 "3rd vec" and "1st vec"
6619 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6620 | New vectors |
6622 So that now new vectors are:
6624 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6625 2nd vec: (10 13) (16 19 22) (17 20 23)
6626 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6628 2nd step:
6629 shift right by 5 the concatenation of:
6630 "1st vec" and "3rd vec"
6631 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6632 "2nd vec" and "1st vec"
6633 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6634 "3rd vec" and "2nd vec"
6635 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6636 | New vectors |
6638 So that now new vectors are:
6640 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6641 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6642 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6644 3rd step:
6645 shift right by 5 the concatenation of:
6646 "1st vec" and "1st vec"
6647 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6648 shift right by 3 the concatenation of:
6649 "2nd vec" and "2nd vec"
6650 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6651 | New vectors |
6653 So that now all vectors are READY:
6654 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6655 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6656 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6658    This algorithm is faster than the one in vect_permute_load_chain if:
6659    1. "shift of a concatenation" is faster than general permutation.
6660       This is usually so.
6661    2. The TARGET machine can't execute vector instructions in parallel.
6662       This is because each step of the algorithm depends on the previous
6663       one.  The algorithm in vect_permute_load_chain is much more parallel.
6665 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
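/* Standalone sketch of the mask generation used by the LENGTH == 3 path
   below, assuming nelt == 8 as in the example; it just prints the reorder
   mask and the first shift mask:

     #include <stdio.h>

     int main (void)
     {
       const unsigned nelt = 8;         // assumed vector length
       unsigned sel[8], k = 0, l = 0;
       for (unsigned i = 0; i < nelt; i++)
         {
           if (3 * k + (l % 3) >= nelt)
             {
               k = 0;
               l += 3 - (nelt % 3);
             }
           sel[i] = 3 * k + (l % 3);
           k++;
         }
       printf ("reorder mask:");
       for (unsigned i = 0; i < nelt; i++)
         printf (" %u", sel[i]);                            // 0 3 6 1 4 7 2 5
       printf ("\nshift1 mask:");
       for (unsigned i = 0; i < nelt; i++)
         printf (" %u", 2 * (nelt / 3) + (nelt % 3) + i);   // 6 7 8 9 10 11 12 13
       printf ("\n");
       return 0;
     }
*/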
6668 static bool
6669 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6670 unsigned int length,
6671 stmt_vec_info stmt_info,
6672 gimple_stmt_iterator *gsi,
6673 vec<tree> *result_chain)
6675 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6676 tree perm2_mask1, perm2_mask2, perm3_mask;
6677 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6678 gimple *perm_stmt;
6680 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6681 machine_mode vmode = TYPE_MODE (vectype);
6682 unsigned int i;
6683 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6685 unsigned HOST_WIDE_INT nelt, vf;
6686 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6687 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6688 /* Not supported for variable-length vectors. */
6689 return false;
6691 vec_perm_builder sel (nelt, nelt, 1);
6692 sel.quick_grow (nelt);
6694 result_chain->quick_grow (length);
6695 memcpy (result_chain->address (), dr_chain.address (),
6696 length * sizeof (tree));
6698 if (pow2p_hwi (length) && vf > 4)
6700 unsigned int j, log_length = exact_log2 (length);
6701 for (i = 0; i < nelt / 2; ++i)
6702 sel[i] = i * 2;
6703 for (i = 0; i < nelt / 2; ++i)
6704 sel[nelt / 2 + i] = i * 2 + 1;
6705 vec_perm_indices indices (sel, 2, nelt);
6706 if (!can_vec_perm_const_p (vmode, vmode, indices))
6708 if (dump_enabled_p ())
6709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6710 "shuffle of 2 fields structure is not \
6711 supported by target\n");
6712 return false;
6714 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6716 for (i = 0; i < nelt / 2; ++i)
6717 sel[i] = i * 2 + 1;
6718 for (i = 0; i < nelt / 2; ++i)
6719 sel[nelt / 2 + i] = i * 2;
6720 indices.new_vector (sel, 2, nelt);
6721 if (!can_vec_perm_const_p (vmode, vmode, indices))
6723 if (dump_enabled_p ())
6724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6725 "shuffle of 2 fields structure is not \
6726 supported by target\n");
6727 return false;
6729 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6731 /* Generating permutation constant to shift all elements.
6732 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6733 for (i = 0; i < nelt; i++)
6734 sel[i] = nelt / 2 + i;
6735 indices.new_vector (sel, 2, nelt);
6736 if (!can_vec_perm_const_p (vmode, vmode, indices))
6738 if (dump_enabled_p ())
6739 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6740 "shift permutation is not supported by target\n");
6741 return false;
6743 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6745 /* Generating permutation constant to select vector from 2.
6746 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6747 for (i = 0; i < nelt / 2; i++)
6748 sel[i] = i;
6749 for (i = nelt / 2; i < nelt; i++)
6750 sel[i] = nelt + i;
6751 indices.new_vector (sel, 2, nelt);
6752 if (!can_vec_perm_const_p (vmode, vmode, indices))
6754 if (dump_enabled_p ())
6755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6756 "select is not supported by target\n");
6757 return false;
6759 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6761 for (i = 0; i < log_length; i++)
6763 for (j = 0; j < length; j += 2)
6765 first_vect = dr_chain[j];
6766 second_vect = dr_chain[j + 1];
6768 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6769 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6770 first_vect, first_vect,
6771 perm2_mask1);
6772 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6773 vect[0] = data_ref;
6775 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6776 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6777 second_vect, second_vect,
6778 perm2_mask2);
6779 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6780 vect[1] = data_ref;
6782 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6783 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6784 vect[0], vect[1], shift1_mask);
6785 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6786 (*result_chain)[j/2 + length/2] = data_ref;
6788 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6789 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6790 vect[0], vect[1], select_mask);
6791 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6792 (*result_chain)[j/2] = data_ref;
6794 memcpy (dr_chain.address (), result_chain->address (),
6795 length * sizeof (tree));
6797 return true;
6799 if (length == 3 && vf > 2)
6801 unsigned int k = 0, l = 0;
6803       /* Generating permutation constant to get all elements in right order.
6804 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6805 for (i = 0; i < nelt; i++)
6807 if (3 * k + (l % 3) >= nelt)
6809 k = 0;
6810 l += (3 - (nelt % 3));
6812 sel[i] = 3 * k + (l % 3);
6813 k++;
6815 vec_perm_indices indices (sel, 2, nelt);
6816 if (!can_vec_perm_const_p (vmode, vmode, indices))
6818 if (dump_enabled_p ())
6819 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6820 "shuffle of 3 fields structure is not \
6821 supported by target\n");
6822 return false;
6824 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6826 /* Generating permutation constant to shift all elements.
6827 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6828 for (i = 0; i < nelt; i++)
6829 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6830 indices.new_vector (sel, 2, nelt);
6831 if (!can_vec_perm_const_p (vmode, vmode, indices))
6833 if (dump_enabled_p ())
6834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6835 "shift permutation is not supported by target\n");
6836 return false;
6838 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6840 /* Generating permutation constant to shift all elements.
6841 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6842 for (i = 0; i < nelt; i++)
6843 sel[i] = 2 * (nelt / 3) + 1 + i;
6844 indices.new_vector (sel, 2, nelt);
6845 if (!can_vec_perm_const_p (vmode, vmode, indices))
6847 if (dump_enabled_p ())
6848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6849 "shift permutation is not supported by target\n");
6850 return false;
6852 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6854 /* Generating permutation constant to shift all elements.
6855 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6856 for (i = 0; i < nelt; i++)
6857 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6858 indices.new_vector (sel, 2, nelt);
6859 if (!can_vec_perm_const_p (vmode, vmode, indices))
6861 if (dump_enabled_p ())
6862 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6863 "shift permutation is not supported by target\n");
6864 return false;
6866 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6868 /* Generating permutation constant to shift all elements.
6869 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6870 for (i = 0; i < nelt; i++)
6871 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6872 indices.new_vector (sel, 2, nelt);
6873 if (!can_vec_perm_const_p (vmode, vmode, indices))
6875 if (dump_enabled_p ())
6876 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6877 "shift permutation is not supported by target\n");
6878 return false;
6880 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);

      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           dr_chain[k], dr_chain[k],
                                           perm3_mask);
          vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
          vect[k] = data_ref;
        }

      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           vect[k % 3], vect[(k + 1) % 3],
                                           shift1_mask);
          vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
          vect_shift[k] = data_ref;
        }

      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           vect_shift[(4 - k) % 3],
                                           vect_shift[(3 - k) % 3],
                                           shift2_mask);
          vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
          vect[k] = data_ref;
        }

      (*result_chain)[3 - (nelt % 3)] = vect[2];

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
                                       vect[0], shift3_mask);
      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
      (*result_chain)[nelt % 3] = data_ref;

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
                                       vect[1], shift4_mask);
      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
      (*result_chain)[0] = data_ref;
      return true;
    }
  return false;
}

/* Function vect_transform_grouped_load.

   Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
   to perform their permutation and assign the resulting vectorized statements
   to the scalar statements.  */

void
vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
                             vec<tree> dr_chain,
                             int size, gimple_stmt_iterator *gsi)
{
  machine_mode mode;
  vec<tree> result_chain = vNULL;

  /* DR_CHAIN contains input data-refs that are a part of the interleaving.
     RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
     permuted vectors, which are ready for vector computation.  */
  result_chain.create (size);

  /* If the reassociation width for the vector type is 2 or greater, the
     target machine can execute two or more vector instructions in parallel,
     so the regular permute chain is used; the same holds when SIZE is a
     power of two.  Otherwise try to build the chain for the load group with
     vect_shift_permute_load_chain and fall back to vect_permute_load_chain
     if that fails.  */
  mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
  if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
      || pow2p_hwi (size)
      || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
                                         gsi, &result_chain))
    vect_permute_load_chain (vinfo, dr_chain,
                             size, stmt_info, gsi, &result_chain);
  vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
  result_chain.release ();
}
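
/* Editor's note (illustrative, not part of the original sources): on targets
   whose sched.reassociation_width hook returns 1 for VEC_PERM_EXPR (a common
   default), a non-power-of-two load group such as SIZE == 3 first tries the
   shift-based chain from vect_shift_permute_load_chain, while power-of-two
   group sizes and wider reassociation widths go straight to
   vect_permute_load_chain.  */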

/* RESULT_CHAIN contains the output of a group of grouped loads that were
   generated as part of the vectorization of STMT_INFO.  Assign the statement
   for each vector to the associated scalar statement.  */

void
vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
                                  vec<tree> result_chain)
{
  stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
  unsigned int i, gap_count;
  tree tmp_data_ref;

  /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
  stmt_vec_info next_stmt_info = first_stmt_info;
  gap_count = 1;
  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
    {
      if (!next_stmt_info)
        break;

      /* Skip the gaps.  Loads created for the gaps will be removed by the
         dead code elimination pass later.  No need to check for the first
         stmt in the group, since it always exists.
         DR_GROUP_GAP is the number of steps in elements from the previous
         access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
         correspond to the gaps.  */
      if (next_stmt_info != first_stmt_info
          && gap_count < DR_GROUP_GAP (next_stmt_info))
        {
          gap_count++;
          continue;
        }

      /* ???  The following needs cleanup after the removal of
         DR_GROUP_SAME_DR_STMT.  */
      if (next_stmt_info)
        {
          gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
          /* We assume that if VEC_STMT is not NULL, this is a case of multiple
             copies, and we put the new vector statement last.  */
          STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);

          next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
          gap_count = 1;
        }
    }
}
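
/* Editor's note (an illustrative reading, not part of the original sources):
   if a load group accesses only fields 0 and 2 of a three-field record, the
   second group member has DR_GROUP_GAP == 2.  Assuming a load was generated
   for the unused middle field, as the comment in the loop above describes,
   the first vector of RESULT_CHAIN is assigned to the first scalar stmt, the
   second vector is skipped (gap_count goes from 1 to 2), and the third
   vector is assigned to the second scalar stmt.  */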

/* Function vect_can_force_dr_alignment_p.

   Return whether the alignment of DECL can be forced to be aligned
   on an ALIGNMENT-bit boundary.  */

bool
vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
{
  if (!VAR_P (decl))
    return false;

  if (decl_in_symtab_p (decl)
      && !symtab_node::get (decl)->can_increase_alignment_p ())
    return false;

  if (TREE_STATIC (decl))
    return (known_le (alignment,
                      (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
  else
    return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
}
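
/* Editor's example (hypothetical, for illustration only): for a file-scope

     static float a[256];

   DECL is TREE_STATIC, so its alignment can be raised as long as
   known_le (alignment, MAX_OFILE_ALIGNMENT) holds and the symtab node allows
   it via can_increase_alignment_p; for a local (auto) variable the limit is
   MAX_STACK_ALIGNMENT instead.  */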

/* Return whether the data reference DR_INFO is supported with respect to its
   alignment when accessed as VECTYPE.  MISALIGNMENT is the access's known
   misalignment, 0 if it is known to be aligned, or DR_MISALIGNMENT_UNKNOWN
   if it is not known.  */

enum dr_alignment_support
vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
                               tree vectype, int misalignment)
{
  data_reference *dr = dr_info->dr;
  stmt_vec_info stmt_info = dr_info->stmt;
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (misalignment == 0)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    if (gimple_call_internal_p (stmt)
        && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
            || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
      return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants of the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
        for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                         // that the misalignment of the initial address is
                         // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
      } */
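
  /* Editor's summary of the code below (illustrative, not part of the
     original sources): for reads on targets providing vec_realign_load (and
     a usable builtin_mask_for_load hook), dr_explicit_realign is returned
     when there is no containing loop, or when the access is nested in the
     loop being vectorized with a step different from the vector size;
     otherwise dr_explicit_realign_optimized is returned.  The empty "if"
     arm only skips both answers for SLP accesses whose group layout does not
     guarantee the same misalignment for every vector access, in which case
     the generic misalignment-support check further below decides.  */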

  if (DR_IS_READ (dr))
    {
      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
          && (!targetm.vectorize.builtin_mask_for_load
              || targetm.vectorize.builtin_mask_for_load ()))
        {
          /* If we are doing SLP then the accesses need not have the
             same alignment; instead it depends on the SLP group size.  */
          if (loop_vinfo
              && STMT_SLP_TYPE (stmt_info)
              && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
                  || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                                  * (DR_GROUP_SIZE
                                     (DR_GROUP_FIRST_ELEMENT (stmt_info))),
                                  TYPE_VECTOR_SUBPARTS (vectype))))
            ;
          else if (!loop_vinfo
                   || (nested_in_vect_loop
                       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
                                    GET_MODE_SIZE (TYPE_MODE (vectype)))))
            return dr_explicit_realign;
          else
            return dr_explicit_realign_optimized;
        }
    }

  bool is_packed = false;
  tree type = TREE_TYPE (DR_REF (dr));
  if (misalignment == DR_MISALIGNMENT_UNKNOWN)
    is_packed = not_size_aligned (DR_REF (dr));
  if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
                                                     is_packed))
    return dr_unaligned_supported;

  /* Unsupported.  */
  return dr_unaligned_unsupported;