/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "optabs-tree.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "tree-hash-traits.h"
#include "vec-perm-indices.h"
#include "internal-fn.h"
/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
                              tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode, array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  if (!targetm.array_mode (mode, count).exists (&array_mode))
    {
      poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
      limit_p = !targetm.array_mode_supported_p (mode, count);
      if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "no array mode for %s[%wu]\n",
                             GET_MODE_NAME (mode), count);
          return false;
        }
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cannot use %s<%s><%s>\n", name,
                         GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
                     GET_MODE_NAME (mode));

  return true;
}
/* Return the smallest scalar part of STMT_INFO.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */

tree
vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
                               HOST_WIDE_INT *lhs_size_unit,
                               HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt_info->stmt);
  HOST_WIDE_INT lhs, rhs;

  /* During the analysis phase, this function is called on arbitrary
     statements that might not have scalar results.  */
  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
    return scalar_type;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
  if (assign
      && (gimple_assign_cast_p (assign)
          || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
          || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
        scalar_type = rhs_type;
    }
  else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
    {
      unsigned int i = 0;
      if (gimple_call_internal_p (call))
        {
          internal_fn ifn = gimple_call_internal_fn (call);
          if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
            /* gimple_expr_type already picked the type of the loaded
               or stored data.  */
            i = ~0U;
          else if (internal_fn_mask_index (ifn) == 0)
            i = 1;
        }
      if (i < gimple_call_num_args (call))
        {
          tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
          if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
            {
              rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
              if (rhs < lhs)
                scalar_type = rhs_type;
            }
        }
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}
/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static opt_result
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) param_vect_max_version_for_alias_checks == 0)
    return opt_result::failure_at (vect_location,
                                   "will not create alias checks, as"
                                   " --param vect-max-version-for-alias-checks"
                                   " == 0\n");

  opt_result res
    = runtime_alias_check_p (ddr, loop,
                             optimize_loop_nest_for_speed_p (loop));
  if (!res)
    return res;

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return opt_result::success ();
}
/* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */

static void
vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
{
  vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
  for (unsigned int i = 0; i < checks.length (); ++i)
    if (checks[i] == value)
      return;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "need run-time check that %T is nonzero\n",
                     value);
  LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
}
/* Return true if we know that the order of vectorized DR_INFO_A and
   vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
   DR_INFO_B.  At least one of the accesses is a write.  */

static bool
vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
{
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;

  /* Single statements are always kept in their original order.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
      && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
    return true;

  /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
     SLP group are emitted at the position of the last scalar load and
     all loads in an interleaving group are emitted at the position
     of the first scalar load.
     Stores in a group are emitted at the position of the last scalar store.
     Compute that position and check whether the resulting order matches
     the current one.
     We have not yet decided between SLP and interleaving so we have
     to conservatively assume both.  */
  stmt_vec_info il_a;
  stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
  if (last_a)
    {
      for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
           s = DR_GROUP_NEXT_ELEMENT (s))
        last_a = get_later_stmt (last_a, s);
      if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
        {
          for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
               s = DR_GROUP_NEXT_ELEMENT (s))
            if (get_later_stmt (il_a, s) == il_a)
              il_a = s;
        }
      else
        il_a = last_a;
    }
  else
    last_a = il_a = stmtinfo_a;
  stmt_vec_info il_b;
  stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
  if (last_b)
    {
      for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
           s = DR_GROUP_NEXT_ELEMENT (s))
        last_b = get_later_stmt (last_b, s);
      if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
        {
          for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
               s = DR_GROUP_NEXT_ELEMENT (s))
            if (get_later_stmt (il_b, s) == il_b)
              il_b = s;
        }
      else
        il_b = last_b;
    }
  else
    last_b = il_b = stmtinfo_b;
  bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
  return ((get_later_stmt (last_a, last_b) == last_a) == a_after_b
          && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
          && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
          && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
}
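
/* For instance, given a group of loads from a[i] and a[i+1] that
   interleaves with a store to a[i+1], SLP would emit both loads at the
   position of the last scalar load while interleaving would emit them
   at the first one; the checks above verify that either choice keeps
   the loads on the same side of the store as in the scalar code.  */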
/* A subroutine of vect_analyze_data_ref_dependence.  Handle
   DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
   distances.  These distances are conservatively correct but they don't
   reflect a guaranteed dependence.

   Return true if this function does all the work necessary to avoid
   an alias or false if the caller should use the dependence distances
   to limit the vectorization factor in the usual way.  LOOP_DEPTH is
   the depth of the loop described by LOOP_VINFO and the other arguments
   are as for vect_analyze_data_ref_dependence.  */

static bool
vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
                                       loop_vec_info loop_vinfo,
                                       int loop_depth, unsigned int *max_vf)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  lambda_vector dist_v;
  unsigned int i;
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];
      if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
        {
          /* If the user asserted safelen >= DIST consecutive iterations
             can be executed concurrently, assume independence.

             ??? An alternative would be to add the alias check even
             in this case, and vectorize the fallback loop with the
             maximum VF set to safelen.  However, if the user has
             explicitly given a length, it's less likely that that
             would be a problem.  */
          if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
            {
              if ((unsigned int) loop->safelen < *max_vf)
                *max_vf = loop->safelen;
              LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
              continue;
            }

          /* For dependence distances of 2 or more, we have the option
             of limiting VF or checking for an alias at runtime.
             Prefer to check at runtime if we can, to avoid limiting
             the VF unnecessarily when the bases are in fact independent.

             Note that the alias checks will be removed if the VF ends up
             being small enough.  */
          dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
          dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
          return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
                  && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
                  && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
        }
    }
  return true;
}
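
/* For example, a loop annotated with '#pragma omp simd safelen(8)' has
   loop->safelen == 8, so a conservative dependence distance of up to 8
   is ignored here and the vectorization factor is merely capped at 8.  */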
/* Function vect_analyze_data_ref_dependence.

   FIXME: I needed to change the sense of the returned flag.

   Return FALSE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return TRUE.  Adjust *MAX_VF according to
   the data dependence.  */

static opt_result
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
                                  loop_vec_info loop_vinfo,
                                  unsigned int *max_vf)
{
  unsigned int i;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
  dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return opt_result::success ();

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return opt_result::success ();

  /* We do not have to consider dependences between accesses that belong
     to the same group, unless the stride could be smaller than the
     group size.  */
  if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
      && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
          == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
      && !STMT_VINFO_STRIDED_P (stmtinfo_a))
    return opt_result::success ();

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
                                 get_alias_set (DR_REF (drb))))
    return opt_result::success ();

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if ((unsigned int) loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return opt_result::success ();
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        return opt_result::failure_at
          (stmtinfo_a->stmt,
           "versioning for alias not supported for: "
           "can't determine dependence between %T and %T\n",
           DR_REF (dra), DR_REF (drb));

      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
                         "versioning for alias required: "
                         "can't determine dependence between %T and %T\n",
                         DR_REF (dra), DR_REF (drb));

      /* Add to list of ddrs that need to be tested at run-time.  */
      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if ((unsigned int) loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return opt_result::success ();
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        return opt_result::failure_at
          (stmtinfo_a->stmt,
           "versioning for alias not supported for: "
           "bad dist vector for %T and %T\n",
           DR_REF (dra), DR_REF (drb));

      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
                         "versioning for alias required: "
                         "bad dist vector for %T and %T\n",
                         DR_REF (dra), DR_REF (drb));
      /* Add to list of ddrs that need to be tested at run-time.  */
      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));

  if (DDR_COULD_BE_INDEPENDENT_P (ddr)
      && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
                                                loop_depth, max_vf))
    return opt_result::success ();

  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance  = %d.\n", dist);

      if (dist == 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance == 0 between %T and %T\n",
                             DR_REF (dra), DR_REF (drb));

          /* When we perform grouped accesses and perform implicit CSE
             by detecting equal accesses and doing disambiguation with
             runtime alias tests like for
                .. = a[i];
                .. = a[i+1];
                a[i] = ..;
                a[i+1] = ..;
                *p = ..;
                .. = a[i];
                .. = a[i+1];
             where we will end up loading { a[i], a[i+1] } once, make
             sure that inserting group loads before the first load and
             stores after the last store will do the right thing.
             Similar for groups like
                a[i] = ...;
                ... = a[i];
                a[i+1] = ...;
             where loads from the group interleave with the store.  */
          if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
            return opt_result::failure_at (stmtinfo_a->stmt,
                                           "READ_WRITE dependence"
                                           " in interleaving.\n");

          if (loop->safelen < 2)
            {
              tree indicator = dr_zero_step_indicator (dra);
              if (!indicator || integer_zerop (indicator))
                return opt_result::failure_at (stmtinfo_a->stmt,
                                               "access also has a zero step\n");
              else if (TREE_CODE (indicator) != INTEGER_CST)
                vect_check_nonzero_value (loop_vinfo, indicator);
            }
          continue;
        }

      if (dist > 0 && DDR_REVERSED_P (ddr))
        {
          /* If DDR_REVERSED_P the order of the data-refs in DDR was
             reversed (to make distance vector positive), and the actual
             distance is negative.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance negative.\n");
          /* When doing outer loop vectorization, we need to check if there is
             a backward dependence at the inner loop level if the dependence
             at the outer loop is reversed.  See PR81740.  */
          if (nested_in_vect_loop_p (loop, stmtinfo_a)
              || nested_in_vect_loop_p (loop, stmtinfo_b))
            {
              unsigned inner_depth = index_in_loop_nest (loop->inner->num,
                                                         DDR_LOOP_NEST (ddr));
              if (dist_v[inner_depth] < 0)
                return opt_result::failure_at (stmtinfo_a->stmt,
                                               "not vectorized, dependence "
                                               "between data-refs %T and %T\n",
                                               DR_REF (dra), DR_REF (drb));
            }
          /* Record a negative dependence distance to later limit the
             amount of stmt copying / unrolling we can perform.
             Only need to handle read-after-write dependence.  */
          if (DR_IS_READ (drb)
              && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
                  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
            STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
          continue;
        }

      unsigned int abs_dist = abs (dist);
      if (abs_dist >= 2 && abs_dist < *max_vf)
        {
          /* The dependence distance requires reduction of the maximal
             vectorization factor.  */
          *max_vf = abs_dist;
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "adjusting maximal vectorization factor to %i\n",
                             *max_vf);
        }

      if (abs_dist >= *max_vf)
        {
          /* Dependence distance does not create dependence, as far as
             vectorization is concerned, in this case.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance >= VF.\n");
          continue;
        }

      return opt_result::failure_at (stmtinfo_a->stmt,
                                     "not vectorized, possible dependence "
                                     "between data-refs %T and %T\n",
                                     DR_REF (dra), DR_REF (drb));
    }

  return opt_result::success ();
}
/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

opt_result
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
                                   unsigned int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");

  if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
    {
      LOOP_VINFO_DDRS (loop_vinfo)
        .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
                 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
      /* We need read-read dependences to compute
         STMT_VINFO_SAME_ALIGN_REFS.  */
      bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
                                          &LOOP_VINFO_DDRS (loop_vinfo),
                                          LOOP_VINFO_LOOP_NEST (loop_vinfo),
                                          true);
      gcc_assert (res);
    }

  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;

  /* For epilogues we either have no aliases or alias versioning
     was applied to original loop.  Therefore we may just get max_vf
     using VF of original loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
  else
    FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
      {
        opt_result res
          = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
        if (!res)
          return res;
      }

  return opt_result::success ();
}
/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB for VINFO.  When versioning for alias
   may check a dependence at run-time, return FALSE.  Adjust *MAX_VF
   according to the data dependence.  */

static bool
vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
                                      struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
  dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
      && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
          == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "can't determine dependence between %T and %T\n",
                         DR_REF (dra), DR_REF (drb));
    }
  else if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "determined dependence between %T and %T\n",
                     DR_REF (dra), DR_REF (drb));

  return true;
}
/* Analyze dependences involved in the transform of SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  */

static bool
vect_slp_analyze_node_dependences (vec_info *vinfo,
                                   slp_instance instance, slp_tree node,
                                   vec<stmt_vec_info> stores,
                                   stmt_vec_info last_store_info)
{
  /* This walks over all stmts involved in the SLP load/store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
    {
      stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
      if (access_info == last_access_info)
        continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
      ao_ref ref;
      bool ref_initialized_p = false;
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
           gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          if (! gimple_vuse (stmt)
              || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
            continue;

          /* If we couldn't record a (single) data reference for this
             stmt we have to resort to the alias oracle.  */
          stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
          data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
          if (!dr_b)
            {
              /* We are moving a store or sinking a load - this means
                 we cannot use TBAA for disambiguation.  */
              if (!ref_initialized_p)
                ao_ref_init (&ref, DR_REF (dr_a));
              if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
                  || ref_maybe_used_by_stmt_p (stmt, &ref, false))
                return false;
              continue;
            }

          bool dependent = false;
          /* If we run into a store of this same instance (we've just
             marked those) then delay dependence checking until we run
             into the last store because this is where it will have
             been sunk to (and we verify if we can do that as well).  */
          if (gimple_visited_p (stmt))
            {
              if (stmt_info != last_store_info)
                continue;
              unsigned i;
              stmt_vec_info store_info;
              FOR_EACH_VEC_ELT (stores, i, store_info)
                {
                  data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
                  ddr_p ddr = initialize_data_dependence_relation
                                (dr_a, store_dr, vNULL);
                  dependent
                    = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
                  free_dependence_relation (ddr);
                  if (dependent)
                    break;
                }
            }
          else
            {
              ddr_p ddr = initialize_data_dependence_relation (dr_a,
                                                               dr_b, vNULL);
              dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
              free_dependence_relation (ddr);
            }
          if (dependent)
            return false;
        }
    }
  return true;
}
/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the SLP instance and make sure
   there do not exist any data dependences between them that would
   prevent vectorization.  Return TRUE if there are none.  */

bool
vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = SLP_INSTANCE_TREE (instance);
  if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
    store = NULL;

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  stmt_vec_info last_store_info = NULL;
  if (store)
    {
      if (! vect_slp_analyze_node_dependences (vinfo, instance, store,
                                               vNULL, NULL))
        return false;

      /* Mark stores in this instance and remember the last one.  */
      last_store_info = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
        gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  slp_tree load;
  unsigned int i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
    if (! vect_slp_analyze_node_dependences (vinfo, instance, load,
                                             store
                                             ? SLP_TREE_SCALAR_STMTS (store)
                                             : vNULL, last_store_info))
      {
        res = false;
        break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);

  return res;
}
/* Record the base alignment guarantee given by DRB, which occurs
   in STMT_INFO.  */

static void
vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
                            innermost_loop_behavior *drb)
{
  bool existed;
  innermost_loop_behavior *&entry
    = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
  if (!existed || entry->base_alignment < drb->base_alignment)
    {
      entry = drb;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "recording new base alignment for %T\n"
                         "  alignment:    %d\n"
                         "  misalignment: %d\n"
                         "  based on:     %G",
                         drb->base_address,
                         drb->base_alignment,
                         drb->base_misalignment,
                         stmt_info->stmt);
    }
}

/* If the region we're going to vectorize is reached, all unconditional
   data references occur at least once.  We can therefore pool the base
   alignment guarantees from each unconditional reference.  Do this by
   going through all the data references in VINFO and checking whether
   the containing statement makes the reference unconditionally.  If so,
   record the alignment of the base address in VINFO so that it can be
   used for all other references with the same base.  */

void
vect_record_base_alignments (vec_info *vinfo)
{
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
  data_reference *dr;
  unsigned int i;
  FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
    {
      dr_vec_info *dr_info = vinfo->lookup_dr (dr);
      stmt_vec_info stmt_info = dr_info->stmt;
      if (!DR_IS_CONDITIONAL_IN_STMT (dr)
          && STMT_VINFO_VECTORIZABLE (stmt_info)
          && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
        {
          vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));

          /* If DR is nested in the loop that is being vectorized, we can also
             record the alignment of the base wrt the outer loop.  */
          if (loop && nested_in_vect_loop_p (loop, stmt_info))
            vect_record_base_alignment
              (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
        }
    }
}
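
/* For instance, if one unconditional reference guarantees that base 'p'
   is 32-byte aligned and another only guarantees 16 bytes, the pooled
   entry for 'p' keeps the 32-byte guarantee and every other reference
   with the same base can use it when computing its misalignment.  */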
/* Return the target alignment for the vectorized form of DR_INFO.  */

static poly_uint64
vect_calculate_target_alignment (dr_vec_info *dr_info)
{
  tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
  return targetm.vectorize.preferred_vector_alignment (vectype);
}
/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR_INFO.

   Output:
   1. DR_MISALIGNMENT (DR_INFO) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */

static void
vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info)
{
  stmt_vec_info stmt_info = dr_info->stmt;
  vec_base_alignments *base_alignments = &vinfo->base_alignments;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *loop = NULL;
  tree ref = DR_REF (dr_info->dr);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);

  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
    return;

  innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
  bool step_preserves_misalignment_p;

  poly_uint64 vector_alignment
    = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
  DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;

  /* If the main loop has peeled for alignment we have no way of knowing
     whether the data accesses in the epilogues are aligned.  We can't at
     compile time answer the question whether we have entered the main loop or
     not.  Fixes PR 92351.  */
  if (loop_vinfo)
    {
      loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
      if (orig_loop_vinfo
          && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
        return;
    }

  unsigned HOST_WIDE_INT vect_align_c;
  if (!vector_alignment.is_constant (&vect_align_c))
    return;

  /* No step for BB vectorization.  */
  if (!loop)
    {
      gcc_assert (integer_zerop (drb->step));
      step_preserves_misalignment_p = true;
    }

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector alignment.  */
  else if (nested_in_vect_loop_p (loop, stmt_info))
    {
      step_preserves_misalignment_p
        = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;

      if (dump_enabled_p ())
        {
          if (step_preserves_misalignment_p)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "inner step divides the vector alignment.\n");
          else
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "inner step doesn't divide the vector"
                             " alignment.\n");
        }
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the alignment.  */
  else
    {
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      step_preserves_misalignment_p
        = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);

      if (!step_preserves_misalignment_p && dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "step doesn't divide the vector alignment.\n");
    }

  unsigned int base_alignment = drb->base_alignment;
  unsigned int base_misalignment = drb->base_misalignment;

  /* Calculate the maximum of the pooled base address alignment and the
     alignment that we can compute for DR itself.  */
  innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
  if (entry && base_alignment < (*entry)->base_alignment)
    {
      base_alignment = (*entry)->base_alignment;
      base_misalignment = (*entry)->base_misalignment;
    }

  if (drb->offset_alignment < vect_align_c
      || !step_preserves_misalignment_p
      /* We need to know whether the step wrt the vectorized loop is
         negative when computing the starting misalignment below.  */
      || TREE_CODE (drb->step) != INTEGER_CST)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown alignment for access: %T\n", ref);
      return;
    }

  if (base_alignment < vect_align_c)
    {
      unsigned int max_alignment;
      tree base = get_base_for_alignment (drb->base_address, &max_alignment);
      if (max_alignment < vect_align_c
          || !vect_can_force_dr_alignment_p (base,
                                             vect_align_c * BITS_PER_UNIT))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "can't force alignment of ref: %T\n", ref);
          return;
        }

      /* Force the alignment of the decl.
         NOTE: This is the only change to the code we make during
         the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "force alignment of %T\n", ref);

      dr_info->base_decl = base;
      dr_info->base_misaligned = true;
      base_misalignment = 0;
    }
  poly_int64 misalignment
    = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();

  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_sgn (drb->step) < 0)
    /* PLUS because STEP is negative.  */
    misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
                     * TREE_INT_CST_LOW (drb->step));

  unsigned int const_misalignment;
  if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Non-constant misalignment for access: %T\n", ref);
      return;
    }

  SET_DR_MISALIGNMENT (dr_info, const_misalignment);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                     "misalign = %d bytes of ref %T\n",
                     DR_MISALIGNMENT (dr_info), ref);
}
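
/* Worked example (illustrative): with a pooled base alignment of 16,
   base_misalignment 0 and DR_INIT 4 (e.g. the access 'a[i + 1]' on int),
   the misalignment wrt a 16-byte target alignment is (0 + 4) % 16 = 4.
   For a backward DR with step -4 and a V4SI vectype, 3 * -4 = -12 is
   added first, since the vector access starts three elements before the
   address recorded in the DR.  */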
/* Function vect_update_misalignment_for_peel.
   Sets DR_INFO's misalignment
   - to 0 if it has the same alignment as DR_PEEL_INFO,
   - to the misalignment computed using NPEEL if DR_INFO's misalignment
     is known,
   - to -1 (unknown) otherwise.

   DR_INFO - the data reference whose misalignment is to be adjusted.
   DR_PEEL_INFO - the data reference whose misalignment is being made
                  zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL_INFO is known at compile time.  */

static void
vect_update_misalignment_for_peel (dr_vec_info *dr_info,
                                   dr_vec_info *dr_peel_info, int npeel)
{
  unsigned int i;
  vec<dr_p> same_aligned_drs;
  struct data_reference *current_dr;
  stmt_vec_info peel_stmt_info = dr_peel_info->stmt;

  /* It can be assumed that if dr_info has the same alignment as dr_peel,
     it is aligned in the vector loop.  */
  same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
  FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
    {
      if (current_dr != dr_info->dr)
        continue;
      gcc_assert (!known_alignment_for_access_p (dr_info)
                  || !known_alignment_for_access_p (dr_peel_info)
                  || (DR_MISALIGNMENT (dr_info)
                      == DR_MISALIGNMENT (dr_peel_info)));
      SET_DR_MISALIGNMENT (dr_info, 0);
      return;
    }

  unsigned HOST_WIDE_INT alignment;
  if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
      && known_alignment_for_access_p (dr_info)
      && known_alignment_for_access_p (dr_peel_info))
    {
      int misal = DR_MISALIGNMENT (dr_info);
      misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
      misal &= alignment - 1;
      SET_DR_MISALIGNMENT (dr_info, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
                     "to unknown (-1).\n");
  SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
}
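
/* Worked example (illustrative): peeling npeel = 3 iterations for a DR
   with step 4, known misalignment 4 and constant target alignment 16
   gives (4 + 3 * 4) & 15 = 0, i.e. this access becomes aligned too.  */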
/* Function verify_data_ref_alignment

   Return TRUE if DR_INFO can be handled with respect to alignment.  */

static opt_result
verify_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info)
{
  enum dr_alignment_support supportable_dr_alignment
    = vect_supportable_dr_alignment (vinfo, dr_info, false);
  if (!supportable_dr_alignment)
    return opt_result::failure_at
      (dr_info->stmt->stmt,
       DR_IS_READ (dr_info->dr)
       ? "not vectorized: unsupported unaligned load: %T\n"
       : "not vectorized: unsupported unaligned store: %T\n",
       DR_REF (dr_info->dr));

  if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Vectorizing an unaligned access.\n");

  return opt_result::success ();
}
/* Function vect_verify_datarefs_alignment

   Return TRUE if all data references in the loop can be
   handled with respect to alignment.  */

opt_result
vect_verify_datarefs_alignment (loop_vec_info vinfo)
{
  vec<data_reference_p> datarefs = vinfo->shared->datarefs;
  struct data_reference *dr;
  unsigned int i;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      dr_vec_info *dr_info = vinfo->lookup_dr (dr);
      stmt_vec_info stmt_info = dr_info->stmt;

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      opt_result res = verify_data_ref_alignment (vinfo, dr_info);
      if (!res)
        return res;
    }

  return opt_result::success ();
}
/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
    return true;

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
          > get_object_alignment (exp));
}
/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR_INFO is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (dr_vec_info *dr_info)
{
  stmt_vec_info stmt_info = dr_info->stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
         the prolog loop ({VF - misalignment}), is a multiple of the
         number of the interleaved accesses.  */
      int elem_size, mis_in_elements;

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr_info))
        return false;

      poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
      poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
      elem_size = vector_element_size (vector_size, nelements);
      mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;

      if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
        return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
    {
      HOST_WIDE_INT elmsize =
                int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "data size = %wd. misalignment = %d.\n", elmsize,
                           DR_MISALIGNMENT (dr_info));
        }
      if (DR_MISALIGNMENT (dr_info) % elmsize)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "data size does not divide the misalignment.\n");
          return false;
        }
    }

  if (!known_alignment_for_access_p (dr_info))
    {
      tree type = TREE_TYPE (DR_REF (dr_info->dr));
      bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown misalignment, %snaturally aligned\n",
                         is_packed ? "not " : "");
      return targetm.vectorize.vector_alignment_reachable (type, is_packed);
    }

  return true;
}
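
/* Illustrative check with example values: for V4SI (nelements = 4), a
   misalignment of one element and DR_GROUP_SIZE 2, the prolog would need
   4 - 1 = 3 iterations, which is not a multiple of 2, so alignment via
   peeling is not considered reachable for that interleaved access.  */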
/* Calculate the cost of the memory access represented by DR_INFO.  */

static void
vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
                           unsigned int *inside_cost,
                           unsigned int *outside_cost,
                           stmt_vector_for_cost *body_cost_vec,
                           stmt_vector_for_cost *prologue_cost_vec)
{
  stmt_vec_info stmt_info = dr_info->stmt;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  int ncopies;

  if (PURE_SLP_STMT (stmt_info))
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));

  if (DR_IS_READ (dr_info->dr))
    vect_get_load_cost (vinfo, stmt_info, ncopies, true, inside_cost,
                        outside_cost, prologue_cost_vec, body_cost_vec, false);
  else
    vect_get_store_cost (vinfo, stmt_info, ncopies, inside_cost, body_cost_vec);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_get_data_access_cost: inside_cost = %d, "
                     "outside_cost = %d.\n", *inside_cost, *outside_cost);
}


typedef struct _vect_peel_info
{
  dr_vec_info *dr_info;
  int npeel;
  unsigned int count;
} *vect_peel_info;

typedef struct _vect_peel_extended_info
{
  vec_info *vinfo;
  struct _vect_peel_info peel_info;
  unsigned int inside_cost;
  unsigned int outside_cost;
} *vect_peel_extended_info;
/* Peeling hashtable helpers.  */

struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
{
  static inline hashval_t hash (const _vect_peel_info *);
  static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
};

inline hashval_t
peel_info_hasher::hash (const _vect_peel_info *peel_info)
{
  return (hashval_t) peel_info->npeel;
}

inline bool
peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
{
  return (a->npeel == b->npeel);
}
1349 vect_peeling_hash_insert (hash_table
<peel_info_hasher
> *peeling_htab
,
1350 loop_vec_info loop_vinfo
, dr_vec_info
*dr_info
,
1353 struct _vect_peel_info elem
, *slot
;
1354 _vect_peel_info
**new_slot
;
1355 bool supportable_dr_alignment
1356 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, true);
1359 slot
= peeling_htab
->find (&elem
);
1364 slot
= XNEW (struct _vect_peel_info
);
1365 slot
->npeel
= npeel
;
1366 slot
->dr_info
= dr_info
;
1368 new_slot
= peeling_htab
->find_slot (slot
, INSERT
);
1372 if (!supportable_dr_alignment
1373 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1374 slot
->count
+= VECT_MAX_COST
;
/* Traverse peeling hash table to find peeling option that aligns maximum
   number of data accesses.  */

int
vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
                                     _vect_peel_extended_info *max)
{
  vect_peel_info elem = *slot;

  if (elem->count > max->peel_info.count
      || (elem->count == max->peel_info.count
          && max->peel_info.npeel > elem->npeel))
    {
      max->peel_info.npeel = elem->npeel;
      max->peel_info.count = elem->count;
      max->peel_info.dr_info = elem->dr_info;
    }

  return 1;
}
/* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
   data access costs for all data refs.  If UNKNOWN_MISALIGNMENT is true,
   we assume DR0_INFO's misalignment will be zero after peeling.  */

static void
vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
                                dr_vec_info *dr0_info,
                                unsigned int *inside_cost,
                                unsigned int *outside_cost,
                                stmt_vector_for_cost *body_cost_vec,
                                stmt_vector_for_cost *prologue_cost_vec,
                                unsigned int npeel,
                                bool unknown_misalignment)
{
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  unsigned i;
  data_reference *dr;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
      stmt_vec_info stmt_info = dr_info->stmt;
      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      int save_misalignment;
      save_misalignment = DR_MISALIGNMENT (dr_info);
      if (npeel == 0)
        ;
      else if (unknown_misalignment && dr_info == dr0_info)
        SET_DR_MISALIGNMENT (dr_info, 0);
      else
        vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
      vect_get_data_access_cost (loop_vinfo, dr_info, inside_cost, outside_cost,
                                 body_cost_vec, prologue_cost_vec);
      SET_DR_MISALIGNMENT (dr_info, save_misalignment);
    }
}
/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
                                   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int dummy;
  unsigned int inside_cost = 0, outside_cost = 0;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
                       epilogue_cost_vec;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
                                  &outside_cost, &body_cost_vec,
                                  &prologue_cost_vec, elem->npeel, false);

  body_cost_vec.release ();

  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost
          && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->peel_info.dr_info = elem->dr_info;
      min->peel_info.npeel = elem->npeel;
      min->peel_info.count = elem->count;
    }

  return 1;
}
/* Choose best peeling option by traversing peeling hash table and either
   choosing an option with the lowest cost (if cost model is enabled) or the
   option that aligns as many accesses as possible.  */

static struct _vect_peel_extended_info
vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
                                       loop_vec_info loop_vinfo)
{
  struct _vect_peel_extended_info res;

  res.peel_info.dr_info = NULL;
  res.vinfo = loop_vinfo;

  if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      res.inside_cost = INT_MAX;
      res.outside_cost = INT_MAX;
      peeling_htab->traverse <_vect_peel_extended_info *,
                              vect_peeling_hash_get_lowest_cost> (&res);
    }
  else
    {
      res.peel_info.count = 0;
      peeling_htab->traverse <_vect_peel_extended_info *,
                              vect_peeling_hash_get_most_frequent> (&res);
      res.inside_cost = 0;
      res.outside_cost = 0;
    }

  return res;
}
/* Return true if the new peeling NPEEL is supported.  */

static bool
vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
                          unsigned npeel)
{
  unsigned i;
  struct data_reference *dr = NULL;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  enum dr_alignment_support supportable_dr_alignment;

  /* Ensure that all data refs can be vectorized after the peel.  */
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      int save_misalignment;

      if (dr == dr0_info->dr)
        continue;

      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
      stmt_vec_info stmt_info = dr_info->stmt;
      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      save_misalignment = DR_MISALIGNMENT (dr_info);
      vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
      supportable_dr_alignment
        = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
      SET_DR_MISALIGNMENT (dr_info, save_misalignment);

      if (!supportable_dr_alignment)
        return false;
    }

  return true;
}
/* Function vect_enhance_data_refs_alignment

   This pass will use loop versioning and loop peeling in order to enhance
   the alignment of data references in the loop.

   FOR NOW: we assume that whatever versioning/peeling takes place, only the
   original loop is to be vectorized.  Any other loops that are created by
   the transformations performed in this pass - are not supposed to be
   vectorized.  This restriction will be relaxed.

   This pass will require a cost model to guide it whether to apply peeling
   or versioning or a combination of the two.  For example, the scheme that
   intel uses when given a loop with several memory accesses, is as follows:
   choose one memory access ('p') which alignment you want to force by doing
   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
   other accesses are not necessarily aligned, or (2) use loop versioning to
   generate one loop in which all accesses are aligned, and another loop in
   which only 'p' is necessarily aligned.

   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)

   Devising a cost model is the most critical aspect of this work.  It will
   guide us on which access to peel for, whether to use loop versioning, how
   many versions to create, etc.  The cost model will probably consist of
   generic considerations as well as target specific considerations (on
   powerpc for example, misaligned stores are more painful than misaligned
   loads).

   Here are the general steps involved in alignment enhancements:

     -- original loop, before alignment analysis:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = unknown
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

     -- After vect_compute_data_refs_alignment:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

     -- Possibility 1: we do loop versioning:
     if (p is aligned) {
        for (i=0; i<N; i++){            # loop 1A
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
     }
     else {
        for (i=0; i<N; i++){            # loop 1B
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
     }

     -- Possibility 2: we do loop peeling:
     for (i = 0; i < 3; i++){           # (scalar loop, not to be vectorized).
        x = q[i];
        p[i] = y;
     }
     for (i = 3; i < N; i++){           # loop 2A
        x = q[i];                       # DR_MISALIGNMENT(q) = 0
        p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
     }

     -- Possibility 3: combination of loop peeling and versioning:
     for (i = 0; i < 3; i++){           # (scalar loop, not to be vectorized).
        x = q[i];
        p[i] = y;
     }
     if (p is aligned) {
        for (i = 3; i<N; i++){          # loop 3A
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
     }
     else {
        for (i = 3; i<N; i++){          # loop 3B
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
     }

   These loops are later passed to loop_transform to be vectorized.  The
   vectorizer will use the alignment information to guide the transformation
   (whether to generate regular loads/stores, or with special handling for
   misalignment).  */

opt_result
vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
{
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  enum dr_alignment_support supportable_dr_alignment;
  dr_vec_info *first_store = NULL;
  dr_vec_info *dr0_info = NULL;
  struct data_reference *dr;
  unsigned int i, j;
  bool do_peeling = false;
  bool do_versioning = false;
  unsigned int npeel = 0;
  bool one_misalignment_known = false;
  bool one_misalignment_unknown = false;
  bool one_dr_unsupportable = false;
  dr_vec_info *unsupportable_dr_info = NULL;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned possible_npeel_number = 1;
  tree vectype;
  unsigned int mis, same_align_drs_max = 0;
  hash_table<peel_info_hasher> peeling_htab (1);

  DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");

  /* Reset data so we can safely be called multiple times.  */
  LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
  LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;

  /* While cost model enhancements are expected in the future, the high level
     view of the code at this time is as follows:

     A) If there is a misaligned access then see if peeling to align
        this access can make all data references satisfy
        vect_supportable_dr_alignment.  If so, update data structures
        as needed and return true.

     B) If peeling wasn't possible and there is a data reference with an
        unknown misalignment that does not satisfy vect_supportable_dr_alignment
        then see if loop versioning checks can be used to make all data
        references satisfy vect_supportable_dr_alignment.  If so, update
        data structures as needed and return true.

     C) If neither peeling nor versioning were successful then return false if
        any data reference does not satisfy vect_supportable_dr_alignment.

     D) Return true (all data references satisfy vect_supportable_dr_alignment).

     Note, Possibility 3 above (which is peeling and versioning together) is not
     being done at this time.  */

  /* (1) Peeling to force alignment.  */

  /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
     Considerations:
     + How many accesses will become aligned due to the peeling
     - How many accesses will become unaligned due to the peeling,
       and the cost of misaligned accesses.
     - The cost of peeling (the extra runtime checks, the increase
       in code size).  */

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
      stmt_vec_info stmt_info = dr_info->stmt;

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
        continue;

      /* For scatter-gather or invariant accesses there is nothing
         to enhance.  */
      if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
          || integer_zerop (DR_STEP (dr)))
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      supportable_dr_alignment
        = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
      do_peeling = vector_alignment_reachable_p (dr_info);
      if (do_peeling)
        {
          if (known_alignment_for_access_p (dr_info))
            {
              unsigned int npeel_tmp = 0;
              bool negative = tree_int_cst_compare (DR_STEP (dr),
                                                    size_zero_node) < 0;

              vectype = STMT_VINFO_VECTYPE (stmt_info);
              /* If known_alignment_for_access_p then we have set
                 DR_MISALIGNMENT which is only done if we know it at compiler
                 time, so it is safe to assume target alignment is constant.
               */
              unsigned int target_align =
                DR_TARGET_ALIGNMENT (dr_info).to_constant ();
              unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
              mis = (negative
                     ? DR_MISALIGNMENT (dr_info)
                     : -DR_MISALIGNMENT (dr_info));
              if (DR_MISALIGNMENT (dr_info) != 0)
                npeel_tmp = (mis & (target_align - 1)) / dr_size;

              /* For multiple types, it is possible that the bigger type access
                 will have more than one peeling option.  E.g., a loop with two
                 types: one of size (vector size / 4), and the other one of
                 size (vector size / 8).  Vectorization factor will be 8.  If
                 both accesses are misaligned by 3, the first one needs one
                 scalar iteration to be aligned, and the second one needs 5.
                 But the first one will be aligned also by peeling 5 scalar
                 iterations, and in that case both accesses will be aligned.
                 Hence, except for the immediate peeling amount, we also want
                 to try to add full vector size, while we don't exceed
                 vectorization factor.
                 We do this automatically for cost model, since we calculate
                 cost for every peeling option.  */
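              /* Worked example (illustrative): with target_align = 16,
                 dr_size = 4 and a misalignment of 12 bytes, npeel_tmp
                 starts at (-12 & 15) / 4 = 1 and the loop below also
                 records 1 + 4 = 5, 1 + 8 = 9, ... as candidate peeling
                 amounts, one per whole vector, up to the limit derived
                 from the vectorization factor.  */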
1795 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1797 poly_uint64 nscalars
= (STMT_SLP_TYPE (stmt_info
)
1798 ? vf
* DR_GROUP_SIZE (stmt_info
) : vf
);
1799 possible_npeel_number
1800 = vect_get_num_vectors (nscalars
, vectype
);
1802 /* NPEEL_TMP is 0 when there is no misalignment, but also
1803 allow peeling NELEMENTS. */
1804 if (DR_MISALIGNMENT (dr_info
) == 0)
1805 possible_npeel_number
++;
1808 /* Save info about DR in the hash table. Also include peeling
1809 amounts according to the explanation above. */
1810 for (j
= 0; j
< possible_npeel_number
; j
++)
1812 vect_peeling_hash_insert (&peeling_htab
, loop_vinfo
,
1813 dr_info
, npeel_tmp
);
1814 npeel_tmp
+= target_align
/ dr_size
;
1817 one_misalignment_known
= true;
1821 /* If we don't know any misalignment values, we prefer
1822 peeling for data-ref that has the maximum number of data-refs
1823 with the same alignment, unless the target prefers to align
1824 stores over load. */
1825 unsigned same_align_drs
1826 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info
).length ();
1828 || same_align_drs_max
< same_align_drs
)
1830 same_align_drs_max
= same_align_drs
;
1833 /* For data-refs with the same number of related
1834 accesses prefer the one where the misalign
1835 computation will be invariant in the outermost loop. */
1836 else if (same_align_drs_max
== same_align_drs
)
1838 class loop
*ivloop0
, *ivloop
;
1839 ivloop0
= outermost_invariant_loop_for_expr
1840 (loop
, DR_BASE_ADDRESS (dr0_info
->dr
));
1841 ivloop
= outermost_invariant_loop_for_expr
1842 (loop
, DR_BASE_ADDRESS (dr
));
1843 if ((ivloop
&& !ivloop0
)
1844 || (ivloop
&& ivloop0
1845 && flow_loop_nested_p (ivloop
, ivloop0
)))
1849 one_misalignment_unknown
= true;
1851 /* Check for data refs with unsupportable alignment that
1853 if (!supportable_dr_alignment
)
1855 one_dr_unsupportable
= true;
1856 unsupportable_dr_info
= dr_info
;
1859 if (!first_store
&& DR_IS_WRITE (dr
))
1860 first_store
= dr_info
;
1865 if (!aligned_access_p (dr_info
))
1867 if (dump_enabled_p ())
1868 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1869 "vector alignment may not be reachable\n");
  /* Check if we can possibly peel the loop.  */
  if (!vect_can_advance_ivs_p (loop_vinfo)
      || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
      || loop->inner)
    do_peeling = false;
  struct _vect_peel_extended_info peel_for_known_alignment;
  struct _vect_peel_extended_info peel_for_unknown_alignment;
  struct _vect_peel_extended_info best_peel;

  peel_for_unknown_alignment.inside_cost = INT_MAX;
  peel_for_unknown_alignment.outside_cost = INT_MAX;
  peel_for_unknown_alignment.peel_info.count = 0;

  if (do_peeling
      && one_misalignment_unknown)
    {
      /* Check if the target requires to prefer stores over loads, i.e., if
         misaligned stores are more expensive than misaligned loads (taking
         drs with same alignment into account).  */
      unsigned int load_inside_cost = 0;
      unsigned int load_outside_cost = 0;
      unsigned int store_inside_cost = 0;
      unsigned int store_outside_cost = 0;
      unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;

      stmt_vector_for_cost dummy;
      dummy.create (2);
      vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
                                      &load_inside_cost,
                                      &load_outside_cost,
                                      &dummy, &dummy, estimated_npeels, true);
      dummy.release ();

      if (first_store)
        {
          dummy.create (2);
          vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
                                          &store_inside_cost,
                                          &store_outside_cost,
                                          &dummy, &dummy,
                                          estimated_npeels, true);
          dummy.release ();
        }
      else
        {
          store_inside_cost = INT_MAX;
          store_outside_cost = INT_MAX;
        }

      if (load_inside_cost > store_inside_cost
          || (load_inside_cost == store_inside_cost
              && load_outside_cost > store_outside_cost))
        {
          dr0_info = first_store;
          peel_for_unknown_alignment.inside_cost = store_inside_cost;
          peel_for_unknown_alignment.outside_cost = store_outside_cost;
        }
      else
        {
          peel_for_unknown_alignment.inside_cost = load_inside_cost;
          peel_for_unknown_alignment.outside_cost = load_outside_cost;
        }

      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
      prologue_cost_vec.create (2);
      epilogue_cost_vec.create (2);

      int dummy2;
      peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
        (loop_vinfo, estimated_npeels, &dummy2,
         &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
         &prologue_cost_vec, &epilogue_cost_vec);

      prologue_cost_vec.release ();
      epilogue_cost_vec.release ();

      peel_for_unknown_alignment.peel_info.count = 1
        + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
    }

  peel_for_unknown_alignment.peel_info.npeel = 0;
  peel_for_unknown_alignment.peel_info.dr_info = dr0_info;

  best_peel = peel_for_unknown_alignment;

  peel_for_known_alignment.inside_cost = INT_MAX;
  peel_for_known_alignment.outside_cost = INT_MAX;
  peel_for_known_alignment.peel_info.count = 0;
  peel_for_known_alignment.peel_info.dr_info = NULL;
  if (do_peeling && one_misalignment_known)
    {
      /* Peeling is possible, but there is no data access that is not supported
         unless aligned.  So we try to choose the best possible peeling from
         the hash table.  */
      peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
        (&peeling_htab, loop_vinfo);
    }

  /* Compare costs of peeling for known and unknown alignment.  */
  if (peel_for_known_alignment.peel_info.dr_info != NULL
      && peel_for_unknown_alignment.inside_cost
         >= peel_for_known_alignment.inside_cost)
    {
      best_peel = peel_for_known_alignment;

      /* If the best peeling for known alignment has NPEEL == 0, perform no
         peeling at all except if there is an unsupportable dr that we can
         align.  */
      if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
        do_peeling = false;
    }

  /* If there is an unsupportable data ref, prefer this over all choices so far
     since we'd have to discard a chosen peeling except when it accidentally
     aligned the unsupportable data ref.  */
  if (one_dr_unsupportable)
    dr0_info = unsupportable_dr_info;
  else if (do_peeling)
    {
      /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
         TODO: Use nopeel_outside_cost or get rid of it?  */
      unsigned nopeel_inside_cost = 0;
      unsigned nopeel_outside_cost = 0;

      stmt_vector_for_cost dummy;
      dummy.create (2);
      vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
                                      &nopeel_outside_cost, &dummy, &dummy,
                                      0, false);
      dummy.release ();

      /* Add epilogue costs.  As we do not peel for alignment here, no prologue
         costs will be recorded.  */
      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
      prologue_cost_vec.create (2);
      epilogue_cost_vec.create (2);

      int dummy2;
      nopeel_outside_cost += vect_get_known_peeling_cost
        (loop_vinfo, 0, &dummy2,
         &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
         &prologue_cost_vec, &epilogue_cost_vec);

      prologue_cost_vec.release ();
      epilogue_cost_vec.release ();

      npeel = best_peel.peel_info.npeel;
      dr0_info = best_peel.peel_info.dr_info;

      /* If not peeling is no more expensive than the best peeling found
         so far, don't perform any peeling.  */
      if (nopeel_inside_cost <= best_peel.inside_cost)
        do_peeling = false;
    }
  if (do_peeling)
    {
      stmt_vec_info stmt_info = dr0_info->stmt;
      vectype = STMT_VINFO_VECTYPE (stmt_info);

      if (known_alignment_for_access_p (dr0_info))
        {
          bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
                                                size_zero_node) < 0;
          if (!npeel)
            {
              /* Since it's known at compile time, compute the number of
                 iterations in the peeled loop (the peeling factor) for use in
                 updating DR_MISALIGNMENT values.  The peeling factor is the
                 vectorization factor minus the misalignment as an element
                 count.  */
              mis = (negative
                     ? DR_MISALIGNMENT (dr0_info)
                     : -DR_MISALIGNMENT (dr0_info));
              /* If known_alignment_for_access_p then we have set
                 DR_MISALIGNMENT which is only done if we know it at compiler
                 time, so it is safe to assume target alignment is constant.
               */
              unsigned int target_align =
                DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
              npeel = ((mis & (target_align - 1))
                       / vect_get_scalar_dr_size (dr0_info));
            }

          /* For interleaved data access every iteration accesses all the
             members of the group, therefore we divide the number of iterations
             by the group size.  */
          if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
            npeel /= DR_GROUP_SIZE (stmt_info);

          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Try peeling by %d\n", npeel);
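          /* Worked example added in this rewrite (illustrative numbers only):
             an NPEEL of 6 elements for an interleaving group of size 2
             becomes 3 peeled loop iterations, because each iteration of the
             scalar loop accesses every member of the group.  */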
        }

      /* Ensure that all datarefs can be vectorized after the peel.  */
      if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
        do_peeling = false;

      /* Check if all datarefs are supportable and log.  */
      if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
        {
          opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
          if (!stat)
            do_peeling = false;
          else
            return stat;
        }

      /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
      if (do_peeling)
        {
          unsigned max_allowed_peel
            = param_vect_max_peeling_for_alignment;
          if (flag_vect_cost_model == VECT_COST_MODEL_CHEAP)
            max_allowed_peel = 0;
          if (max_allowed_peel != (unsigned)-1)
            {
              unsigned max_peel = npeel;
              if (max_peel == 0)
                {
                  poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
                  unsigned HOST_WIDE_INT target_align_c;
                  if (target_align.is_constant (&target_align_c))
                    max_peel =
                      target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
                  else
                    {
                      do_peeling = false;
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_NOTE, vect_location,
                                         "Disable peeling, max peels set and vector"
                                         " alignment unknown\n");
                    }
                }
              if (max_peel > max_allowed_peel)
                {
                  do_peeling = false;
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Disable peeling, max peels reached: %d\n",
                                     max_peel);
                }
            }
        }

      /* Cost model #2 - if peeling may result in a remaining loop not
         iterating enough to be vectorized then do not peel.  Since this
         is a cost heuristic rather than a correctness decision, use the
         most likely runtime value for variable vectorization factors.  */
      if (do_peeling
          && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
        {
          unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
          unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
          if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
              < assumed_vf + max_peel)
            do_peeling = false;
        }
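      /* Worked example added in this rewrite (illustrative numbers only):
         with an assumed VF of 8, NPEEL of 3 and only 10 known iterations,
         10 < 8 + 3, so peeling is disabled because the remaining loop would
         not iterate often enough to be vectorized.  */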
      if (do_peeling)
        {
          /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
             If the misalignment of DR_i is identical to that of dr0 then set
             DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
             dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
             by the peeling factor times the element size of DR_i (MOD the
             vectorization factor times the size).  Otherwise, the
             misalignment of DR_i must be set to unknown.  */
          FOR_EACH_VEC_ELT (datarefs, i, dr)
            if (dr != dr0_info->dr)
              {
                /* Strided accesses perform only component accesses, alignment
                   is irrelevant for them.  */
                dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
                stmt_info = dr_info->stmt;
                if (STMT_VINFO_STRIDED_P (stmt_info)
                    && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
                  continue;

                vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
              }

          LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
          if (npeel)
            LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
          else
            LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
              = DR_MISALIGNMENT (dr0_info);
          SET_DR_MISALIGNMENT (dr0_info, 0);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Alignment of access forced using peeling.\n");
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Peeling for alignment will be applied.\n");
            }

          /* The inside-loop cost will be accounted for in vectorizable_load
             and vectorizable_store correctly with adjusted alignments.
             Drop the body_cst_vec on the floor here.  */
          opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
          gcc_assert (stat);
          return stat;
        }
    }
  /* (2) Versioning to force alignment.  */

  /* Try versioning if:
     1) optimize loop for speed and the cost-model is not cheap
     2) there is at least one unsupported misaligned data ref with an unknown
        misalignment, and
     3) all misaligned data refs with a known misalignment are supported, and
     4) the number of runtime alignment checks is within reason.  */

  do_versioning
    = (optimize_loop_nest_for_speed_p (loop)
       && !loop->inner /* FORNOW */
       && flag_vect_cost_model != VECT_COST_MODEL_CHEAP);

  if (do_versioning)
    {
      FOR_EACH_VEC_ELT (datarefs, i, dr)
        {
          dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
          stmt_vec_info stmt_info = dr_info->stmt;

          /* For interleaving, only the alignment of the first access
             matters.  */
          if (aligned_access_p (dr_info)
              || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
                  && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
            continue;

          if (STMT_VINFO_STRIDED_P (stmt_info))
            {
              /* Strided loads perform only component accesses, alignment is
                 irrelevant for them.  */
              if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
                continue;
              do_versioning = false;
              break;
            }

          supportable_dr_alignment
            = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);

          if (!supportable_dr_alignment)
            {
              int mask;

              if (known_alignment_for_access_p (dr_info)
                  || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
                     >= (unsigned) param_vect_max_version_for_alignment_checks)
                {
                  do_versioning = false;
                  break;
                }

              vectype = STMT_VINFO_VECTYPE (stmt_info);
              gcc_assert (vectype);

              /* At present we don't support versioning for alignment
                 with variable VF, since there's no guarantee that the
                 VF is a power of two.  We could relax this if we added
                 a way of enforcing a power-of-two size.  */
              unsigned HOST_WIDE_INT size;
              if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
                {
                  do_versioning = false;
                  break;
                }

              /* Forcing alignment in the first iteration is no good if
                 we don't keep it across iterations.  For now, just disable
                 versioning in this case.
                 ?? We could actually unroll the loop to achieve the required
                 overall step alignment, and forcing the alignment could be
                 done by doing some iterations of the non-vectorized loop.  */
              if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                               * DR_STEP_ALIGNMENT (dr),
                               DR_TARGET_ALIGNMENT (dr_info)))
                {
                  do_versioning = false;
                  break;
                }

              /* The rightmost bits of an aligned address must be zeros.
                 Construct the mask needed for this test.  For example,
                 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
                 mask must be 15 = 0xf.  */
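              /* Note added in this rewrite: for a 16-byte vector mode the
                 mask below is 15 (0xf); the versioned loop is then guarded by
                 a runtime check that effectively tests (address & mask) == 0
                 for each potentially misaligned reference recorded via
                 LOOP_VINFO_PTR_MASK (illustrative description only).  */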
              mask = size - 1;

              /* FORNOW: use the same mask to test all potentially unaligned
                 references in the loop.  */
              if (LOOP_VINFO_PTR_MASK (loop_vinfo)
                  && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
                {
                  do_versioning = false;
                  break;
                }

              LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
              LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
            }
        }

      /* Versioning requires at least one misaligned data reference.  */
      if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
        do_versioning = false;
      else if (!do_versioning)
        LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
    }

  if (do_versioning)
    {
      vec<stmt_vec_info> may_misalign_stmts
        = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
      stmt_vec_info stmt_info;

      /* It can now be assumed that the data references in the statements
         in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
         of the loop being vectorized.  */
      FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
        {
          dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
          SET_DR_MISALIGNMENT (dr_info, 0);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Alignment of access forced using versioning.\n");
        }

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Versioning for alignment will be applied.\n");

      /* Peeling and versioning can't be done together at this time.  */
      gcc_assert (! (do_peeling && do_versioning));

      opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
      gcc_assert (stat);
      return stat;
    }

  /* This point is reached if neither peeling nor versioning is being done.  */
  gcc_assert (! (do_peeling || do_versioning));

  opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
  return stat;
}
/* Function vect_find_same_alignment_drs.

   Update group and alignment relations in VINFO according to the chosen
   vectorization factor.  */

static void
vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
  dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;

  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return;

  if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
    return;

  if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
      || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
      || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
    return;

  /* Two references with distance zero have the same alignment.  */
  poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
                          - wi::to_poly_offset (DR_INIT (drb)));
  if (maybe_ne (diff, 0))
    {
      /* Get the wider of the two alignments.  */
      poly_uint64 align_a =
        exact_div (vect_calculate_target_alignment (dr_info_a),
                   BITS_PER_UNIT);
      poly_uint64 align_b =
        exact_div (vect_calculate_target_alignment (dr_info_b),
                   BITS_PER_UNIT);
      unsigned HOST_WIDE_INT align_a_c, align_b_c;
      if (!align_a.is_constant (&align_a_c)
          || !align_b.is_constant (&align_b_c))
        return;

      unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);

      /* Require the gap to be a multiple of the larger vector alignment.  */
      if (!multiple_p (diff, max_align))
        return;
    }

  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "accesses have the same alignment: %T and %T\n",
                     DR_REF (dra), DR_REF (drb));
}
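/* Illustrative example added in this rewrite (numbers are hypothetical, not
   from the original source): two int accesses whose DR_INITs differ by 32
   bytes are recorded above as having the same alignment when both target
   alignments are 16 bytes or less, because the 32-byte gap is a multiple of
   the larger alignment.  */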
/* Function vect_analyze_data_refs_alignment

   Analyze the alignment of the data-references in the loop.
   Return FALSE if a data reference is found that cannot be vectorized.  */

opt_result
vect_analyze_data_refs_alignment (loop_vec_info vinfo)
{
  DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");

  /* Mark groups of data references with same alignment using
     data dependence information.  */
  vec<ddr_p> ddrs = vinfo->shared->ddrs;
  struct data_dependence_relation *ddr;
  unsigned int i;

  FOR_EACH_VEC_ELT (ddrs, i, ddr)
    vect_find_same_alignment_drs (vinfo, ddr);

  vec<data_reference_p> datarefs = vinfo->shared->datarefs;
  struct data_reference *dr;

  vect_record_base_alignments (vinfo);
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      dr_vec_info *dr_info = vinfo->lookup_dr (dr);
      if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
        vect_compute_data_ref_alignment (vinfo, dr_info);
    }

  return opt_result::success ();
}
/* Analyze alignment of DRs of stmts in NODE.  */

static bool
vect_slp_analyze_and_verify_node_alignment (vec_info *vinfo, slp_tree node)
{
  /* We vectorize from the first scalar stmt in the node unless
     the node is permuted in which case we start from the first
     element in the group.  */
  stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
  dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);

  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
  vect_compute_data_ref_alignment (vinfo, dr_info);
  /* For creating the data-ref pointer we need alignment of the
     first element anyway.  */
  if (dr_info != first_dr_info)
    vect_compute_data_ref_alignment (vinfo, first_dr_info);
  if (! verify_data_ref_alignment (vinfo, dr_info))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: bad data alignment in basic "
                         "block.\n");
      return false;
    }

  return true;
}

/* Function vect_slp_analyze_instance_alignment

   Analyze the alignment of the data-references in the SLP instance.
   Return FALSE if a data reference is found that cannot be vectorized.  */

bool
vect_slp_analyze_and_verify_instance_alignment (vec_info *vinfo,
                                                slp_instance instance)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");

  slp_tree node;
  unsigned i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
    if (! vect_slp_analyze_and_verify_node_alignment (vinfo, node))
      return false;

  node = SLP_INSTANCE_TREE (instance);
  if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
      && ! vect_slp_analyze_and_verify_node_alignment
           (vinfo, SLP_INSTANCE_TREE (instance)))
    return false;

  return true;
}
/* Analyze groups of accesses: check that DR_INFO belongs to a group of
   accesses of legal size, step, etc.  Detect gaps, single element
   interleaving, and other special cases.  Set grouped access info.
   Collect groups of strided stores for further use in SLP analysis.
   Worker for vect_analyze_group_access.  */

static bool
vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
{
  data_reference *dr = dr_info->dr;
  tree step = DR_STEP (dr);
  tree scalar_type = TREE_TYPE (DR_REF (dr));
  HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
  stmt_vec_info stmt_info = dr_info->stmt;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
  HOST_WIDE_INT dr_step = -1;
  HOST_WIDE_INT groupsize, last_accessed_element = 1;
  bool slp_impossible = false;

  /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
     size of the interleaving group (including gaps).  */
  if (tree_fits_shwi_p (step))
    {
      dr_step = tree_to_shwi (step);
      /* Check that STEP is a multiple of type size.  Otherwise there is
         a non-element-sized gap at the end of the group which we
         cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
         ???  As we can handle non-constant step fine here we should
         simply remove uses of DR_GROUP_GAP between the last and first
         element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
         simply not include that gap.  */
      if ((dr_step % type_size) != 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Step %T is not a multiple of the element size"
                             " for %T\n", step, DR_REF (dr));
          return false;
        }
      groupsize = absu_hwi (dr_step) / type_size;
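      /* Worked example added in this rewrite (illustrative numbers only):
         a constant step of 32 bytes over 4-byte elements gives a group size
         of 8 elements, counting any gaps inside the group.  */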
    }
  else
    groupsize = 0;

  /* Non-consecutive access is possible only if it is a part of interleaving.  */
  if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
    {
      /* Check if this DR is a part of interleaving, and is a single
         element of the group that is accessed in the loop.  */

      /* Gaps are supported only for loads.  STEP must be a multiple of the
         type size.  */
      if (DR_IS_READ (dr)
          && (dr_step % type_size) == 0
          && groupsize > 0)
        {
          DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
          DR_GROUP_SIZE (stmt_info) = groupsize;
          DR_GROUP_GAP (stmt_info) = groupsize - 1;
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Detected single element interleaving %T"
                             " step %T\n", DR_REF (dr), step);
          return true;
        }

      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not consecutive access %G", stmt_info->stmt);

      if (bb_vinfo)
        {
          /* Mark the statement as unvectorizable.  */
          STMT_VINFO_VECTORIZABLE (stmt_info) = false;
          return true;
        }

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
      STMT_VINFO_STRIDED_P (stmt_info) = true;
      return true;
    }

  if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
    {
      /* First stmt in the interleaving chain.  Check the chain.  */
      stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
      struct data_reference *data_ref = dr;
      unsigned int count = 1;
      tree prev_init = DR_INIT (data_ref);
      HOST_WIDE_INT diff, gaps = 0;

      /* By construction, all group members have INTEGER_CST DR_INITs.  */
      while (next)
        {
          /* We never have the same DR multiple times.  */
          gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
                                            DR_INIT (STMT_VINFO_DATA_REF (next)))
                      != 0);

          data_ref = STMT_VINFO_DATA_REF (next);

          /* All group members have the same STEP by construction.  */
          gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));

          /* Check that the distance between two accesses is equal to the type
             size.  Otherwise, we have gaps.  */
          diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
                  - TREE_INT_CST_LOW (prev_init)) / type_size;
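          /* Worked example added in this rewrite (illustrative numbers only):
             with 4-byte elements, a previous DR_INIT of 0 and a current
             DR_INIT of 8, DIFF is 2, i.e. there is a gap of one element
             between the two group members.  */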
2599 /* FORNOW: SLP of accesses with gaps is not supported. */
2600 slp_impossible
= true;
2601 if (DR_IS_WRITE (data_ref
))
2603 if (dump_enabled_p ())
2604 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2605 "interleaved store with gaps\n");
2612 last_accessed_element
+= diff
;
2614 /* Store the gap from the previous member of the group. If there is no
2615 gap in the access, DR_GROUP_GAP is always 1. */
2616 DR_GROUP_GAP (next
) = diff
;
2618 prev_init
= DR_INIT (data_ref
);
2619 next
= DR_GROUP_NEXT_ELEMENT (next
);
2620 /* Count the number of data-refs in the chain. */
2625 groupsize
= count
+ gaps
;
2627 /* This could be UINT_MAX but as we are generating code in a very
2628 inefficient way we have to cap earlier. See PR78699 for example. */
2629 if (groupsize
> 4096)
2631 if (dump_enabled_p ())
2632 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2633 "group is too large\n");
2637 /* Check that the size of the interleaving is equal to count for stores,
2638 i.e., that there are no gaps. */
2639 if (groupsize
!= count
2640 && !DR_IS_READ (dr
))
2643 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2646 /* If there is a gap after the last load in the group it is the
2647 difference between the groupsize and the last accessed
2649 When there is no gap, this difference should be 0. */
2650 DR_GROUP_GAP (stmt_info
) = groupsize
- last_accessed_element
;
2652 DR_GROUP_SIZE (stmt_info
) = groupsize
;
2653 if (dump_enabled_p ())
2655 dump_printf_loc (MSG_NOTE
, vect_location
,
2656 "Detected interleaving ");
2657 if (DR_IS_READ (dr
))
2658 dump_printf (MSG_NOTE
, "load ");
2659 else if (STMT_VINFO_STRIDED_P (stmt_info
))
2660 dump_printf (MSG_NOTE
, "strided store ");
2662 dump_printf (MSG_NOTE
, "store ");
2663 dump_printf (MSG_NOTE
, "of size %u\n",
2664 (unsigned)groupsize
);
2665 dump_printf_loc (MSG_NOTE
, vect_location
, "\t%G", stmt_info
->stmt
);
2666 next
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2669 if (DR_GROUP_GAP (next
) != 1)
2670 dump_printf_loc (MSG_NOTE
, vect_location
,
2671 "\t<gap of %d elements>\n",
2672 DR_GROUP_GAP (next
) - 1);
2673 dump_printf_loc (MSG_NOTE
, vect_location
, "\t%G", next
->stmt
);
2674 next
= DR_GROUP_NEXT_ELEMENT (next
);
2676 if (DR_GROUP_GAP (stmt_info
) != 0)
2677 dump_printf_loc (MSG_NOTE
, vect_location
,
2678 "\t<gap of %d elements>\n",
2679 DR_GROUP_GAP (stmt_info
));
2682 /* SLP: create an SLP data structure for every interleaving group of
2683 stores for further analysis in vect_analyse_slp. */
2684 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2687 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt_info
);
2689 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt_info
);
2696 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2697 accesses of legal size, step, etc. Detect gaps, single element
2698 interleaving, and other special cases. Set grouped access info.
2699 Collect groups of strided stores for further use in SLP analysis. */
2702 vect_analyze_group_access (vec_info
*vinfo
, dr_vec_info
*dr_info
)
2704 if (!vect_analyze_group_access_1 (vinfo
, dr_info
))
2706 /* Dissolve the group if present. */
2707 stmt_vec_info stmt_info
= DR_GROUP_FIRST_ELEMENT (dr_info
->stmt
);
2710 stmt_vec_info next
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2711 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2712 DR_GROUP_NEXT_ELEMENT (stmt_info
) = NULL
;
2720 /* Analyze the access pattern of the data-reference DR_INFO.
2721 In case of non-consecutive accesses call vect_analyze_group_access() to
2722 analyze groups of accesses. */
2725 vect_analyze_data_ref_access (vec_info
*vinfo
, dr_vec_info
*dr_info
)
2727 data_reference
*dr
= dr_info
->dr
;
2728 tree step
= DR_STEP (dr
);
2729 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2730 stmt_vec_info stmt_info
= dr_info
->stmt
;
2731 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
2732 class loop
*loop
= NULL
;
2734 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
2738 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2740 if (loop_vinfo
&& !step
)
2742 if (dump_enabled_p ())
2743 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2744 "bad data-ref access in loop\n");
2748 /* Allow loads with zero step in inner-loop vectorization. */
2749 if (loop_vinfo
&& integer_zerop (step
))
2751 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2752 if (!nested_in_vect_loop_p (loop
, stmt_info
))
2753 return DR_IS_READ (dr
);
2754 /* Allow references with zero step for outer loops marked
2755 with pragma omp simd only - it guarantees absence of
2756 loop-carried dependencies between inner loop iterations. */
2757 if (loop
->safelen
< 2)
2759 if (dump_enabled_p ())
2760 dump_printf_loc (MSG_NOTE
, vect_location
,
2761 "zero step in inner loop of nest\n");
2766 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
2768 /* Interleaved accesses are not yet supported within outer-loop
2769 vectorization for references in the inner-loop. */
2770 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2772 /* For the rest of the analysis we use the outer-loop step. */
2773 step
= STMT_VINFO_DR_STEP (stmt_info
);
2774 if (integer_zerop (step
))
2776 if (dump_enabled_p ())
2777 dump_printf_loc (MSG_NOTE
, vect_location
,
2778 "zero step in outer loop.\n");
2779 return DR_IS_READ (dr
);
2784 if (TREE_CODE (step
) == INTEGER_CST
)
2786 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2787 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2789 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2791 /* Mark that it is not interleaving. */
2792 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2797 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
2799 if (dump_enabled_p ())
2800 dump_printf_loc (MSG_NOTE
, vect_location
,
2801 "grouped access in outer loop.\n");
2806 /* Assume this is a DR handled by non-constant strided load case. */
2807 if (TREE_CODE (step
) != INTEGER_CST
)
2808 return (STMT_VINFO_STRIDED_P (stmt_info
)
2809 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2810 || vect_analyze_group_access (vinfo
, dr_info
)));
2812 /* Not consecutive access - check if it's a part of interleaving group. */
2813 return vect_analyze_group_access (vinfo
, dr_info
);
2816 /* Compare two data-references DRA and DRB to group them into chunks
2817 suitable for grouping. */
2820 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2822 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
2823 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
2826 /* Stabilize sort. */
2830 /* DRs in different loops never belong to the same group. */
2831 loop_p loopa
= gimple_bb (DR_STMT (dra
))->loop_father
;
2832 loop_p loopb
= gimple_bb (DR_STMT (drb
))->loop_father
;
2834 return loopa
->num
< loopb
->num
? -1 : 1;
2836 /* Ordering of DRs according to base. */
2837 cmp
= data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
2838 DR_BASE_ADDRESS (drb
));
2842 /* And according to DR_OFFSET. */
2843 cmp
= data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2847 /* Put reads before writes. */
2848 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2849 return DR_IS_READ (dra
) ? -1 : 1;
2851 /* Then sort after access size. */
2852 cmp
= data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2853 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2857 /* And after step. */
2858 cmp
= data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2862 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2863 cmp
= data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
));
2865 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2869 /* If OP is the result of a conversion, return the unconverted value,
2870 otherwise return null. */
2873 strip_conversion (tree op
)
2875 if (TREE_CODE (op
) != SSA_NAME
)
2877 gimple
*stmt
= SSA_NAME_DEF_STMT (op
);
2878 if (!is_gimple_assign (stmt
)
2879 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt
)))
2881 return gimple_assign_rhs1 (stmt
);
2884 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2885 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
2886 be grouped in SLP mode. */
2889 can_group_stmts_p (stmt_vec_info stmt1_info
, stmt_vec_info stmt2_info
,
2892 if (gimple_assign_single_p (stmt1_info
->stmt
))
2893 return gimple_assign_single_p (stmt2_info
->stmt
);
2895 gcall
*call1
= dyn_cast
<gcall
*> (stmt1_info
->stmt
);
2896 if (call1
&& gimple_call_internal_p (call1
))
2898 /* Check for two masked loads or two masked stores. */
2899 gcall
*call2
= dyn_cast
<gcall
*> (stmt2_info
->stmt
);
2900 if (!call2
|| !gimple_call_internal_p (call2
))
2902 internal_fn ifn
= gimple_call_internal_fn (call1
);
2903 if (ifn
!= IFN_MASK_LOAD
&& ifn
!= IFN_MASK_STORE
)
2905 if (ifn
!= gimple_call_internal_fn (call2
))
2908 /* Check that the masks are the same. Cope with casts of masks,
2909 like those created by build_mask_conversion. */
2910 tree mask1
= gimple_call_arg (call1
, 2);
2911 tree mask2
= gimple_call_arg (call2
, 2);
2912 if (!operand_equal_p (mask1
, mask2
, 0)
2913 && (ifn
== IFN_MASK_STORE
|| !allow_slp_p
))
2915 mask1
= strip_conversion (mask1
);
2918 mask2
= strip_conversion (mask2
);
2921 if (!operand_equal_p (mask1
, mask2
, 0))
2930 /* Function vect_analyze_data_ref_accesses.
2932 Analyze the access pattern of all the data references in the loop.
2934 FORNOW: the only access pattern that is considered vectorizable is a
2935 simple step 1 (consecutive) access.
2937 FORNOW: handle only arrays and pointer accesses. */
2940 vect_analyze_data_ref_accesses (vec_info
*vinfo
)
2943 vec
<data_reference_p
> datarefs
= vinfo
->shared
->datarefs
;
2944 struct data_reference
*dr
;
2946 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2948 if (datarefs
.is_empty ())
2949 return opt_result::success ();
2951 /* Sort the array of datarefs to make building the interleaving chains
2952 linear. Don't modify the original vector's order, it is needed for
2953 determining what dependencies are reversed. */
2954 vec
<data_reference_p
> datarefs_copy
= datarefs
.copy ();
2955 datarefs_copy
.qsort (dr_group_sort_cmp
);
2956 hash_set
<stmt_vec_info
> to_fixup
;
2958 /* Build the interleaving chains. */
2959 for (i
= 0; i
< datarefs_copy
.length () - 1;)
2961 data_reference_p dra
= datarefs_copy
[i
];
2962 dr_vec_info
*dr_info_a
= vinfo
->lookup_dr (dra
);
2963 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
2964 stmt_vec_info lastinfo
= NULL
;
2965 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
2966 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
))
2971 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
2973 data_reference_p drb
= datarefs_copy
[i
];
2974 dr_vec_info
*dr_info_b
= vinfo
->lookup_dr (drb
);
2975 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
2976 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b
)
2977 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
2980 /* ??? Imperfect sorting (non-compatible types, non-modulo
2981 accesses, same accesses) can lead to a group to be artificially
2982 split here as we don't just skip over those. If it really
2983 matters we can push those to a worklist and re-iterate
2984 over them. The we can just skip ahead to the next DR here. */
2986 /* DRs in a different loop should not be put into the same
2987 interleaving group. */
2988 if (gimple_bb (DR_STMT (dra
))->loop_father
2989 != gimple_bb (DR_STMT (drb
))->loop_father
)
2992 /* Check that the data-refs have same first location (except init)
2993 and they are both either store or load (not load and store,
2994 not masked loads or stores). */
2995 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2996 || data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
2997 DR_BASE_ADDRESS (drb
)) != 0
2998 || data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
)) != 0
2999 || !can_group_stmts_p (stmtinfo_a
, stmtinfo_b
, true))
3002 /* Check that the data-refs have the same constant size. */
3003 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
3004 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
3005 if (!tree_fits_uhwi_p (sza
)
3006 || !tree_fits_uhwi_p (szb
)
3007 || !tree_int_cst_equal (sza
, szb
))
3010 /* Check that the data-refs have the same step. */
3011 if (data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
)) != 0)
3014 /* Check the types are compatible.
3015 ??? We don't distinguish this during sorting. */
3016 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
3017 TREE_TYPE (DR_REF (drb
))))
3020 /* Check that the DR_INITs are compile-time constants. */
3021 if (TREE_CODE (DR_INIT (dra
)) != INTEGER_CST
3022 || TREE_CODE (DR_INIT (drb
)) != INTEGER_CST
)
3025 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3026 just hold extra information. */
3027 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a
)
3028 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b
)
3029 && data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
)) == 0)
3032 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3033 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
3034 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
3035 HOST_WIDE_INT init_prev
3036 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy
[i
-1]));
3037 gcc_assert (init_a
<= init_b
3038 && init_a
<= init_prev
3039 && init_prev
<= init_b
);
3041 /* Do not place the same access in the interleaving chain twice. */
3042 if (init_b
== init_prev
)
3044 gcc_assert (gimple_uid (DR_STMT (datarefs_copy
[i
-1]))
3045 < gimple_uid (DR_STMT (drb
)));
3046 /* Simply link in duplicates and fix up the chain below. */
3050 /* If init_b == init_a + the size of the type * k, we have an
3051 interleaving, and DRA is accessed before DRB. */
3052 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
3053 if (type_size_a
== 0
3054 || (init_b
- init_a
) % type_size_a
!= 0)
3057 /* If we have a store, the accesses are adjacent. This splits
3058 groups into chunks we support (we don't support vectorization
3059 of stores with gaps). */
3060 if (!DR_IS_READ (dra
) && init_b
- init_prev
!= type_size_a
)
3063 /* If the step (if not zero or non-constant) is greater than the
3064 difference between data-refs' inits this splits groups into
3066 if (tree_fits_shwi_p (DR_STEP (dra
)))
3068 HOST_WIDE_INT step
= tree_to_shwi (DR_STEP (dra
));
3069 if (step
!= 0 && step
<= (init_b
- init_a
))
3074 if (dump_enabled_p ())
3075 dump_printf_loc (MSG_NOTE
, vect_location
,
3077 ? "Detected interleaving load %T and %T\n"
3078 : "Detected interleaving store %T and %T\n",
3079 DR_REF (dra
), DR_REF (drb
));
3081 /* Link the found element into the group list. */
3082 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a
))
3084 DR_GROUP_FIRST_ELEMENT (stmtinfo_a
) = stmtinfo_a
;
3085 lastinfo
= stmtinfo_a
;
3087 DR_GROUP_FIRST_ELEMENT (stmtinfo_b
) = stmtinfo_a
;
3088 DR_GROUP_NEXT_ELEMENT (lastinfo
) = stmtinfo_b
;
3089 lastinfo
= stmtinfo_b
;
3091 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a
)
3092 = !can_group_stmts_p (stmtinfo_a
, stmtinfo_b
, false);
3094 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a
))
3095 dump_printf_loc (MSG_NOTE
, vect_location
,
3096 "Load suitable for SLP vectorization only.\n");
3098 if (init_b
== init_prev
3099 && !to_fixup
.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
))
3100 && dump_enabled_p ())
3101 dump_printf_loc (MSG_NOTE
, vect_location
,
3102 "Queuing group with duplicate access for fixup\n");
3106 /* Fixup groups with duplicate entries by splitting it. */
3109 hash_set
<stmt_vec_info
>::iterator it
= to_fixup
.begin ();
3110 if (!(it
!= to_fixup
.end ()))
3112 stmt_vec_info grp
= *it
;
3113 to_fixup
.remove (grp
);
3115 /* Find the earliest duplicate group member. */
3116 unsigned first_duplicate
= -1u;
3117 stmt_vec_info next
, g
= grp
;
3118 while ((next
= DR_GROUP_NEXT_ELEMENT (g
)))
3120 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next
)->dr
),
3121 DR_INIT (STMT_VINFO_DR_INFO (g
)->dr
))
3122 && gimple_uid (STMT_VINFO_STMT (next
)) < first_duplicate
)
3123 first_duplicate
= gimple_uid (STMT_VINFO_STMT (next
));
3126 if (first_duplicate
== -1U)
3129 /* Then move all stmts after the first duplicate to a new group.
3130 Note this is a heuristic but one with the property that *it
3131 is fixed up completely. */
3133 stmt_vec_info newgroup
= NULL
, ng
= grp
;
3134 while ((next
= DR_GROUP_NEXT_ELEMENT (g
)))
3136 if (gimple_uid (STMT_VINFO_STMT (next
)) >= first_duplicate
)
3138 DR_GROUP_NEXT_ELEMENT (g
) = DR_GROUP_NEXT_ELEMENT (next
);
3142 DR_GROUP_NEXT_ELEMENT (ng
) = next
;
3144 DR_GROUP_FIRST_ELEMENT (ng
) = newgroup
;
3147 g
= DR_GROUP_NEXT_ELEMENT (g
);
3149 DR_GROUP_NEXT_ELEMENT (ng
) = NULL
;
3151 /* Fixup the new group which still may contain duplicates. */
3152 to_fixup
.add (newgroup
);
3155 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr
)
3157 dr_vec_info
*dr_info
= vinfo
->lookup_dr (dr
);
3158 if (STMT_VINFO_VECTORIZABLE (dr_info
->stmt
)
3159 && !vect_analyze_data_ref_access (vinfo
, dr_info
))
3161 if (dump_enabled_p ())
3162 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3163 "not vectorized: complicated access pattern.\n");
3165 if (is_a
<bb_vec_info
> (vinfo
))
3167 /* Mark the statement as not vectorizable. */
3168 STMT_VINFO_VECTORIZABLE (dr_info
->stmt
) = false;
3173 datarefs_copy
.release ();
3174 return opt_result::failure_at (dr_info
->stmt
->stmt
,
3176 " complicated access pattern.\n");
3181 datarefs_copy
.release ();
3182 return opt_result::success ();
/* Function vect_vfa_segment_size.

   Input:
     DR_INFO: The data reference.
     LENGTH_FACTOR: segment length to consider.

   Return a value suitable for the dr_with_seg_len::seg_len field.
   This is the "distance travelled" by the pointer from the first
   iteration in the segment to the last.  Note that it does not include
   the size of the access; in effect it only describes the first byte.  */

static tree
vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
{
  length_factor = size_binop (MINUS_EXPR,
                              fold_convert (sizetype, length_factor),
                              size_one_node);
  return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
                     length_factor);
}
->dr
)),
3206 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3207 gives the worst-case number of bytes covered by the segment. */
3209 static unsigned HOST_WIDE_INT
3210 vect_vfa_access_size (vec_info
*vinfo
, dr_vec_info
*dr_info
)
3212 stmt_vec_info stmt_vinfo
= dr_info
->stmt
;
3213 tree ref_type
= TREE_TYPE (DR_REF (dr_info
->dr
));
3214 unsigned HOST_WIDE_INT ref_size
= tree_to_uhwi (TYPE_SIZE_UNIT (ref_type
));
3215 unsigned HOST_WIDE_INT access_size
= ref_size
;
3216 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo
))
3218 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo
) == stmt_vinfo
);
3219 access_size
*= DR_GROUP_SIZE (stmt_vinfo
) - DR_GROUP_GAP (stmt_vinfo
);
3221 if (STMT_VINFO_VEC_STMT (stmt_vinfo
)
3222 && (vect_supportable_dr_alignment (vinfo
, dr_info
, false)
3223 == dr_explicit_realign_optimized
))
3225 /* We might access a full vector's worth. */
3226 tree vectype
= STMT_VINFO_VECTYPE (stmt_vinfo
);
3227 access_size
+= tree_to_uhwi (TYPE_SIZE_UNIT (vectype
)) - ref_size
;
3232 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3236 vect_vfa_align (dr_vec_info
*dr_info
)
3238 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info
->dr
)));
/* Function vect_no_alias_p.

   Given data references A and B with equal base and offset, see whether
   the alias relation can be decided at compilation time.  Return 1 if
   it can and the references alias, 0 if it can and the references do
   not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
   SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
   of dr_with_seg_len::{seg_len,access_size} for A and B.  */

static int
vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
                         tree segment_length_a, tree segment_length_b,
                         unsigned HOST_WIDE_INT access_size_a,
                         unsigned HOST_WIDE_INT access_size_b)
{
  poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
  poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
  poly_uint64 const_length_a;
  poly_uint64 const_length_b;

  /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
     bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
     [a, a+12).  */
  if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
    {
      const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
      offset_a -= const_length_a;
    }
  else
    const_length_a = tree_to_poly_uint64 (segment_length_a);
  if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
    {
      const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
      offset_b -= const_length_b;
    }
  else
    const_length_b = tree_to_poly_uint64 (segment_length_b);

  const_length_a += access_size_a;
  const_length_b += access_size_b;

  if (ranges_known_overlap_p (offset_a, const_length_a,
                              offset_b, const_length_b))
    return 1;

  if (!ranges_maybe_overlap_p (offset_a, const_length_a,
                               offset_b, const_length_b))
    return 0;

  return -1;
}
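/* Worked example added in this rewrite (illustrative numbers only): with
   offsets 0 and 16 and both adjusted lengths equal to 16 bytes, the ranges
   [0, 16) and [16, 32) cannot overlap and the function above returns 0
   (no alias); if the first length were 20 bytes the ranges would be known to
   overlap and it would return 1.  */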
3293 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3297 dependence_distance_ge_vf (data_dependence_relation
*ddr
,
3298 unsigned int loop_depth
, poly_uint64 vf
)
3300 if (DDR_ARE_DEPENDENT (ddr
) != NULL_TREE
3301 || DDR_NUM_DIST_VECTS (ddr
) == 0)
3304 /* If the dependence is exact, we should have limited the VF instead. */
3305 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr
));
3308 lambda_vector dist_v
;
3309 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
3311 HOST_WIDE_INT dist
= dist_v
[loop_depth
];
3313 && !(dist
> 0 && DDR_REVERSED_P (ddr
))
3314 && maybe_lt ((unsigned HOST_WIDE_INT
) abs_hwi (dist
), vf
))
3318 if (dump_enabled_p ())
3319 dump_printf_loc (MSG_NOTE
, vect_location
,
3320 "dependence distance between %T and %T is >= VF\n",
3321 DR_REF (DDR_A (ddr
)), DR_REF (DDR_B (ddr
)));
3326 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3329 dump_lower_bound (dump_flags_t dump_kind
, const vec_lower_bound
&lower_bound
)
3331 dump_printf (dump_kind
, "%s (%T) >= ",
3332 lower_bound
.unsigned_p
? "unsigned" : "abs",
3334 dump_dec (dump_kind
, lower_bound
.min_value
);
3337 /* Record that the vectorized loop requires the vec_lower_bound described
3338 by EXPR, UNSIGNED_P and MIN_VALUE. */
3341 vect_check_lower_bound (loop_vec_info loop_vinfo
, tree expr
, bool unsigned_p
,
3342 poly_uint64 min_value
)
3344 vec
<vec_lower_bound
> lower_bounds
= LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
);
3345 for (unsigned int i
= 0; i
< lower_bounds
.length (); ++i
)
3346 if (operand_equal_p (lower_bounds
[i
].expr
, expr
, 0))
3348 unsigned_p
&= lower_bounds
[i
].unsigned_p
;
3349 min_value
= upper_bound (lower_bounds
[i
].min_value
, min_value
);
3350 if (lower_bounds
[i
].unsigned_p
!= unsigned_p
3351 || maybe_lt (lower_bounds
[i
].min_value
, min_value
))
3353 lower_bounds
[i
].unsigned_p
= unsigned_p
;
3354 lower_bounds
[i
].min_value
= min_value
;
3355 if (dump_enabled_p ())
3357 dump_printf_loc (MSG_NOTE
, vect_location
,
3358 "updating run-time check to ");
3359 dump_lower_bound (MSG_NOTE
, lower_bounds
[i
]);
3360 dump_printf (MSG_NOTE
, "\n");
3366 vec_lower_bound
lower_bound (expr
, unsigned_p
, min_value
);
3367 if (dump_enabled_p ())
3369 dump_printf_loc (MSG_NOTE
, vect_location
, "need a run-time check that ");
3370 dump_lower_bound (MSG_NOTE
, lower_bound
);
3371 dump_printf (MSG_NOTE
, "\n");
3373 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).safe_push (lower_bound
);
3376 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3377 will span fewer than GAP bytes. */
3380 vect_small_gap_p (loop_vec_info loop_vinfo
, dr_vec_info
*dr_info
,
3383 stmt_vec_info stmt_info
= dr_info
->stmt
;
3385 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo
));
3386 if (DR_GROUP_FIRST_ELEMENT (stmt_info
))
3387 count
*= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info
));
3388 return (estimated_poly_value (gap
)
3389 <= count
* vect_get_scalar_dr_size (dr_info
));
/* Return true if we know that there is no alias between DR_INFO_A and
   DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
   When returning true, set *LOWER_BOUND_OUT to this N.  */

static bool
vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
                                poly_uint64 *lower_bound_out)
{
  /* Check that there is a constant gap of known sign between DR_A
     and DR_B.  */
  data_reference *dr_a = dr_info_a->dr;
  data_reference *dr_b = dr_info_b->dr;
  poly_int64 init_a, init_b;
  if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
      || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
      || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
      || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
      || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
      || !ordered_p (init_a, init_b))
    return false;

  /* Sort DR_A and DR_B by the address they access.  */
  if (maybe_lt (init_b, init_a))
    {
      std::swap (init_a, init_b);
      std::swap (dr_info_a, dr_info_b);
      std::swap (dr_a, dr_b);
    }

  /* If the two accesses could be dependent within a scalar iteration,
     make sure that we'd retain their order.  */
  if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
      && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
    return false;

  /* There is no alias if abs (DR_STEP) is greater than or equal to
     the bytes spanned by the combination of the two accesses.  */
  *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
  return true;
}
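/* Worked example added in this rewrite (illustrative numbers only): for two
   accesses with the same base at byte offsets 0 and 4 and 4-byte elements,
   the bound computed above is 4 + 4 - 0 == 8, so the accesses cannot alias
   as long as abs (DR_STEP) >= 8 at runtime.  */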
3433 /* Function vect_prune_runtime_alias_test_list.
3435 Prune a list of ddrs to be tested at run-time by versioning for alias.
3436 Merge several alias checks into one if possible.
3437 Return FALSE if resulting list of ddrs is longer then allowed by
3438 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3441 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
3443 typedef pair_hash
<tree_operand_hash
, tree_operand_hash
> tree_pair_hash
;
3444 hash_set
<tree_pair_hash
> compared_objects
;
3446 vec
<ddr_p
> may_alias_ddrs
= LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
3447 vec
<dr_with_seg_len_pair_t
> &comp_alias_ddrs
3448 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
3449 vec
<vec_object_pair
> &check_unequal_addrs
3450 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
);
3451 poly_uint64 vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3452 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
3458 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3460 /* Step values are irrelevant for aliasing if the number of vector
3461 iterations is equal to the number of scalar iterations (which can
3462 happen for fully-SLP loops). */
3463 bool ignore_step_p
= known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), 1U);
3467 /* Convert the checks for nonzero steps into bound tests. */
3469 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo
), i
, value
)
3470 vect_check_lower_bound (loop_vinfo
, value
, true, 1);
3473 if (may_alias_ddrs
.is_empty ())
3474 return opt_result::success ();
3476 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
3478 unsigned int loop_depth
3479 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo
)->num
,
3480 LOOP_VINFO_LOOP_NEST (loop_vinfo
));
3482 /* First, we collect all data ref pairs for aliasing checks. */
3483 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
3485 poly_uint64 lower_bound
;
3486 tree segment_length_a
, segment_length_b
;
3487 unsigned HOST_WIDE_INT access_size_a
, access_size_b
;
3488 unsigned int align_a
, align_b
;
3490 /* Ignore the alias if the VF we chose ended up being no greater
3491 than the dependence distance. */
3492 if (dependence_distance_ge_vf (ddr
, loop_depth
, vect_factor
))
3495 if (DDR_OBJECT_A (ddr
))
3497 vec_object_pair
new_pair (DDR_OBJECT_A (ddr
), DDR_OBJECT_B (ddr
));
3498 if (!compared_objects
.add (new_pair
))
3500 if (dump_enabled_p ())
3501 dump_printf_loc (MSG_NOTE
, vect_location
,
3502 "checking that %T and %T"
3503 " have different addresses\n",
3504 new_pair
.first
, new_pair
.second
);
3505 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).safe_push (new_pair
);
3510 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (DDR_A (ddr
));
3511 stmt_vec_info stmt_info_a
= dr_info_a
->stmt
;
3513 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (DDR_B (ddr
));
3514 stmt_vec_info stmt_info_b
= dr_info_b
->stmt
;
3516 bool preserves_scalar_order_p
3517 = vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
);
3519 /* Skip the pair if inter-iteration dependencies are irrelevant
3520 and intra-iteration dependencies are guaranteed to be honored. */
3522 && (preserves_scalar_order_p
3523 || vectorizable_with_step_bound_p (dr_info_a
, dr_info_b
,
3526 if (dump_enabled_p ())
3527 dump_printf_loc (MSG_NOTE
, vect_location
,
3528 "no need for alias check between "
3529 "%T and %T when VF is 1\n",
3530 DR_REF (dr_info_a
->dr
), DR_REF (dr_info_b
->dr
));
3534 /* See whether we can handle the alias using a bounds check on
3535 the step, and whether that's likely to be the best approach.
3536 (It might not be, for example, if the minimum step is much larger
3537 than the number of bytes handled by one vector iteration.) */
3539 && TREE_CODE (DR_STEP (dr_info_a
->dr
)) != INTEGER_CST
3540 && vectorizable_with_step_bound_p (dr_info_a
, dr_info_b
,
3542 && (vect_small_gap_p (loop_vinfo
, dr_info_a
, lower_bound
)
3543 || vect_small_gap_p (loop_vinfo
, dr_info_b
, lower_bound
)))
3545 bool unsigned_p
= dr_known_forward_stride_p (dr_info_a
->dr
);
3546 if (dump_enabled_p ())
3548 dump_printf_loc (MSG_NOTE
, vect_location
, "no alias between "
3549 "%T and %T when the step %T is outside ",
3550 DR_REF (dr_info_a
->dr
),
3551 DR_REF (dr_info_b
->dr
),
3552 DR_STEP (dr_info_a
->dr
));
3554 dump_printf (MSG_NOTE
, "[0");
3557 dump_printf (MSG_NOTE
, "(");
3558 dump_dec (MSG_NOTE
, poly_int64 (-lower_bound
));
3560 dump_printf (MSG_NOTE
, ", ");
3561 dump_dec (MSG_NOTE
, lower_bound
);
3562 dump_printf (MSG_NOTE
, ")\n");
3564 vect_check_lower_bound (loop_vinfo
, DR_STEP (dr_info_a
->dr
),
3565 unsigned_p
, lower_bound
);
3569 stmt_vec_info dr_group_first_a
= DR_GROUP_FIRST_ELEMENT (stmt_info_a
);
3570 if (dr_group_first_a
)
3572 stmt_info_a
= dr_group_first_a
;
3573 dr_info_a
= STMT_VINFO_DR_INFO (stmt_info_a
);
3576 stmt_vec_info dr_group_first_b
= DR_GROUP_FIRST_ELEMENT (stmt_info_b
);
3577 if (dr_group_first_b
)
3579 stmt_info_b
= dr_group_first_b
;
3580 dr_info_b
= STMT_VINFO_DR_INFO (stmt_info_b
);
3585 segment_length_a
= size_zero_node
;
3586 segment_length_b
= size_zero_node
;
3590 if (!operand_equal_p (DR_STEP (dr_info_a
->dr
),
3591 DR_STEP (dr_info_b
->dr
), 0))
3592 length_factor
= scalar_loop_iters
;
3594 length_factor
= size_int (vect_factor
);
3595 segment_length_a
= vect_vfa_segment_size (dr_info_a
, length_factor
);
3596 segment_length_b
= vect_vfa_segment_size (dr_info_b
, length_factor
);
3598 access_size_a
= vect_vfa_access_size (loop_vinfo
, dr_info_a
);
3599 access_size_b
= vect_vfa_access_size (loop_vinfo
, dr_info_b
);
3600 align_a
= vect_vfa_align (dr_info_a
);
3601 align_b
= vect_vfa_align (dr_info_b
);
3603 /* See whether the alias is known at compilation time. */
3604 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a
->dr
),
3605 DR_BASE_ADDRESS (dr_info_b
->dr
), 0)
3606 && operand_equal_p (DR_OFFSET (dr_info_a
->dr
),
3607 DR_OFFSET (dr_info_b
->dr
), 0)
3608 && TREE_CODE (DR_STEP (dr_info_a
->dr
)) == INTEGER_CST
3609 && TREE_CODE (DR_STEP (dr_info_b
->dr
)) == INTEGER_CST
3610 && poly_int_tree_p (segment_length_a
)
3611 && poly_int_tree_p (segment_length_b
))
3613 int res
= vect_compile_time_alias (dr_info_a
, dr_info_b
,
3618 if (res
>= 0 && dump_enabled_p ())
3620 dump_printf_loc (MSG_NOTE
, vect_location
,
3621 "can tell at compile time that %T and %T",
3622 DR_REF (dr_info_a
->dr
), DR_REF (dr_info_b
->dr
));
3624 dump_printf (MSG_NOTE
, " do not alias\n");
3626 dump_printf (MSG_NOTE
, " alias\n");
3633 return opt_result::failure_at (stmt_info_b
->stmt
,
3635 " compilation time alias: %G%G",
      dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
			    access_size_a, align_a);
      dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
			    access_size_b, align_b);

      /* Canonicalize the order to be the one that's needed for accurate
	 RAW, WAR and WAW flags, in cases where the data references are
	 well-ordered.  The order doesn't really matter otherwise,
	 but we might as well be consistent.  */
      if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
	std::swap (dr_a, dr_b);

      dr_with_seg_len_pair_t dr_with_seg_len_pair
	(dr_a, dr_b, (preserves_scalar_order_p
		      ? dr_with_seg_len_pair_t::WELL_ORDERED
		      : dr_with_seg_len_pair_t::REORDERED));

      comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
    }

  prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);

  unsigned int count = (comp_alias_ddrs.length ()
			+ check_unequal_addrs.length ());

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "improved number of alias checks from %d to %d\n",
		     may_alias_ddrs.length (), count);

  unsigned limit = param_vect_max_version_for_alias_checks;
  if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
    limit = param_vect_max_version_for_alias_checks * 6 / 10;
  if (count > limit)
    return opt_result::failure_at
      (vect_location,
       "number of versioning for alias run-time tests exceeds %d "
       "(--param vect-max-version-for-alias-checks)\n", limit);

  return opt_result::success ();
}
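
/* Illustrative sketch (not part of the vectorizer sources): the pairs pushed
   onto COMP_ALIAS_DDRS above expand into run-time tests of the form "the two
   address segments do not overlap".  A plain-C model of that test, using
   hypothetical names and assuming each segment is [addr, addr + seg):  */

static inline bool
example_segments_do_not_overlap (unsigned long addr_a, unsigned long seg_a,
				 unsigned long addr_b, unsigned long seg_b)
{
  /* No aliasing if A ends at or before B starts, or B ends at or before
     A starts.  */
  return addr_a + seg_a <= addr_b || addr_b + seg_b <= addr_a;
}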
/* Check whether we can use an internal function for a gather load
   or scatter store.  READ_P is true for loads and false for stores.
   MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
   the type of the memory elements being loaded or stored.  OFFSET_TYPE
   is the type of the offset that is being applied to the invariant
   base address.  SCALE is the amount by which the offset should
   be multiplied *after* it has been converted to address width.

   Return true if the function is supported, storing the function id in
   *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.  */
bool
vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
			  tree vectype, tree memory_type, tree offset_type,
			  int scale, internal_fn *ifn_out,
			  tree *offset_vectype_out)
{
  unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
  unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
  if (element_bits != memory_bits)
    /* For now the vector elements must be the same width as the
       memory elements.  */
    return false;

  /* Work out which function we need.  */
  internal_fn ifn;
  if (read_p)
    ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
  else
    ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;

  for (;;)
    {
      tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
      if (!offset_vectype)
	return false;

      /* Test whether the target supports this combination.  */
      if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
						  offset_vectype, scale))
	{
	  *ifn_out = ifn;
	  *offset_vectype_out = offset_vectype;
	  return true;
	}

      /* Otherwise retry with a wider offset type; give up once widening
	 can no longer help.  */
      if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
	  && TYPE_PRECISION (offset_type) >= element_bits)
	return false;

      offset_type = build_nonstandard_integer_type
	(TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
    }
}
/* STMT_INFO is a call to an internal gather load or scatter store function.
   Describe the operation in INFO.  */
static void
vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
				   gather_scatter_info *info)
{
  gcall *call = as_a <gcall *> (stmt_info->stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);

  info->ifn = gimple_call_internal_fn (call);
  info->decl = NULL_TREE;
  info->base = gimple_call_arg (call, 0);
  info->offset = gimple_call_arg (call, 1);
  info->offset_dt = vect_unknown_def_type;
  info->offset_vectype = NULL_TREE;
  info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
  info->element_type = TREE_TYPE (vectype);
  info->memory_type = TREE_TYPE (DR_REF (dr));
}
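
/* Illustrative sketch (not part of the vectorizer sources): the call
   described above has the shape IFN_GATHER_LOAD (base, offset, scale, ...),
   i.e. each lane reads from BASE plus its lane offset times SCALE.  A
   plain-C model of that semantics for int elements, with hypothetical
   names:  */

static void
example_gather_int (int *dst, const char *base, const int *offset,
		    long scale, int nlanes)
{
  for (int i = 0; i < nlanes; i++)
    dst[i] = *(const int *) (base + (long) offset[i] * scale);
}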
/* Return true if a non-affine read or write in STMT_INFO is suitable for a
   gather load or scatter store.  Describe the operation in *INFO if so.  */
3761 vect_check_gather_scatter (stmt_vec_info stmt_info
, loop_vec_info loop_vinfo
,
3762 gather_scatter_info
*info
)
3764 HOST_WIDE_INT scale
= 1;
3765 poly_int64 pbitpos
, pbitsize
;
3766 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3767 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3768 tree offtype
= NULL_TREE
;
3769 tree decl
= NULL_TREE
, base
, off
;
3770 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3771 tree memory_type
= TREE_TYPE (DR_REF (dr
));
3773 int punsignedp
, reversep
, pvolatilep
= 0;
3775 tree offset_vectype
;
3776 bool masked_p
= false;
3778 /* See whether this is already a call to a gather/scatter internal function.
3779 If not, see whether it's a masked load or store. */
3780 gcall
*call
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
3781 if (call
&& gimple_call_internal_p (call
))
3783 ifn
= gimple_call_internal_fn (call
);
3784 if (internal_gather_scatter_fn_p (ifn
))
3786 vect_describe_gather_scatter_call (stmt_info
, info
);
3789 masked_p
= (ifn
== IFN_MASK_LOAD
|| ifn
== IFN_MASK_STORE
);
3792 /* True if we should aim to use internal functions rather than
3793 built-in functions. */
3794 bool use_ifn_p
= (DR_IS_READ (dr
)
3795 ? supports_vec_gather_load_p ()
3796 : supports_vec_scatter_store_p ());
3799 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3800 see if we can use the def stmt of the address. */
3802 && TREE_CODE (base
) == MEM_REF
3803 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
3804 && integer_zerop (TREE_OPERAND (base
, 1))
3805 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
3807 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
3808 if (is_gimple_assign (def_stmt
)
3809 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
3810 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
  /* The gather and scatter builtins need an address of the form
     loop_invariant + vector * {1, 2, 4, 8}
     or
     loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
     Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
     of loop invariants/SSA_NAMEs defined in the loop, with casts,
     multiplications and additions in it.  To get a vector, we need
     a single SSA_NAME that will be defined in the loop and will
     contain everything that is not loop invariant and that can be
     vectorized.  The following code attempts to find such a preexisting
     SSA_NAME OFF and put the loop invariants into a tree BASE
     that can be gimplified before the loop.  */
3825 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
3826 &punsignedp
, &reversep
, &pvolatilep
);
3830 poly_int64 pbytepos
= exact_div (pbitpos
, BITS_PER_UNIT
);
3832 if (TREE_CODE (base
) == MEM_REF
)
3834 if (!integer_zerop (TREE_OPERAND (base
, 1)))
3836 if (off
== NULL_TREE
)
3837 off
= wide_int_to_tree (sizetype
, mem_ref_offset (base
));
3839 off
= size_binop (PLUS_EXPR
, off
,
3840 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
3842 base
= TREE_OPERAND (base
, 0);
3845 base
= build_fold_addr_expr (base
);
3847 if (off
== NULL_TREE
)
3848 off
= size_zero_node
;
  /* If BASE is not loop invariant then, if OFF is 0, we start with just
     the constant offset in the loop-invariant BASE and continue with BASE
     as OFF; otherwise we give up.
     We could handle that case by gimplifying the addition of BASE + OFF
     into some SSA_NAME and using that as OFF, but for now punt.  */
3855 if (!expr_invariant_in_loop_p (loop
, base
))
3857 if (!integer_zerop (off
))
3860 base
= size_int (pbytepos
);
3862 /* Otherwise put base + constant offset into the loop invariant BASE
3863 and continue with OFF. */
3866 base
= fold_convert (sizetype
, base
);
3867 base
= size_binop (PLUS_EXPR
, base
, size_int (pbytepos
));
3870 /* OFF at this point may be either a SSA_NAME or some tree expression
3871 from get_inner_reference. Try to peel off loop invariants from it
3872 into BASE as long as possible. */
3874 while (offtype
== NULL_TREE
)
3876 enum tree_code code
;
3877 tree op0
, op1
, add
= NULL_TREE
;
3879 if (TREE_CODE (off
) == SSA_NAME
)
3881 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
3883 if (expr_invariant_in_loop_p (loop
, off
))
3886 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3889 op0
= gimple_assign_rhs1 (def_stmt
);
3890 code
= gimple_assign_rhs_code (def_stmt
);
3891 op1
= gimple_assign_rhs2 (def_stmt
);
3895 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3897 code
= TREE_CODE (off
);
3898 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3902 case POINTER_PLUS_EXPR
:
3904 if (expr_invariant_in_loop_p (loop
, op0
))
3909 add
= fold_convert (sizetype
, add
);
3911 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3912 base
= size_binop (PLUS_EXPR
, base
, add
);
3915 if (expr_invariant_in_loop_p (loop
, op1
))
3923 if (expr_invariant_in_loop_p (loop
, op1
))
3925 add
= fold_convert (sizetype
, op1
);
3926 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3932 if (scale
== 1 && tree_fits_shwi_p (op1
))
3934 int new_scale
= tree_to_shwi (op1
);
3935 /* Only treat this as a scaling operation if the target
3936 supports it for at least some offset type. */
3938 && !vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
3939 masked_p
, vectype
, memory_type
,
3940 signed_char_type_node
,
3943 && !vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
3944 masked_p
, vectype
, memory_type
,
3945 unsigned_char_type_node
,
3958 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3959 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3962 /* Don't include the conversion if the target is happy with
3963 the current offset type. */
3965 && vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
3966 masked_p
, vectype
, memory_type
,
3967 TREE_TYPE (off
), scale
, &ifn
,
3971 if (TYPE_PRECISION (TREE_TYPE (op0
))
3972 == TYPE_PRECISION (TREE_TYPE (off
)))
3978 if (TYPE_PRECISION (TREE_TYPE (op0
))
3979 < TYPE_PRECISION (TREE_TYPE (off
)))
3982 offtype
= TREE_TYPE (off
);
3993 /* If at the end OFF still isn't a SSA_NAME or isn't
3994 defined in the loop, punt. */
3995 if (TREE_CODE (off
) != SSA_NAME
3996 || expr_invariant_in_loop_p (loop
, off
))
3999 if (offtype
== NULL_TREE
)
4000 offtype
= TREE_TYPE (off
);
4004 if (!vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
), masked_p
,
4005 vectype
, memory_type
, offtype
, scale
,
4006 &ifn
, &offset_vectype
))
4011 if (DR_IS_READ (dr
))
4013 if (targetm
.vectorize
.builtin_gather
)
4014 decl
= targetm
.vectorize
.builtin_gather (vectype
, offtype
, scale
);
4018 if (targetm
.vectorize
.builtin_scatter
)
4019 decl
= targetm
.vectorize
.builtin_scatter (vectype
, offtype
, scale
);
4026 /* The offset vector type will be read from DECL when needed. */
4027 offset_vectype
= NULL_TREE
;
4034 info
->offset_dt
= vect_unknown_def_type
;
4035 info
->offset_vectype
= offset_vectype
;
4036 info
->scale
= scale
;
4037 info
->element_type
= TREE_TYPE (vectype
);
4038 info
->memory_type
= memory_type
;
/* Find the data references in STMT, analyze them with respect to LOOP and
   append them to DATAREFS.  Return failure if the datarefs in this stmt
   cannot be handled.  */
4047 vect_find_stmt_data_reference (loop_p loop
, gimple
*stmt
,
4048 vec
<data_reference_p
> *datarefs
)
  /* We can ignore clobbers for dataref analysis - they are removed during
     loop vectorization and BB vectorization checks dependences with a
     stmt walk.  */
4053 if (gimple_clobber_p (stmt
))
4054 return opt_result::success ();
4056 if (gimple_has_volatile_ops (stmt
))
4057 return opt_result::failure_at (stmt
, "not vectorized: volatile type: %G",
4060 if (stmt_can_throw_internal (cfun
, stmt
))
4061 return opt_result::failure_at (stmt
,
4063 " statement can throw an exception: %G",
4066 auto_vec
<data_reference_p
, 2> refs
;
4067 opt_result res
= find_data_references_in_stmt (loop
, stmt
, &refs
);
4071 if (refs
.is_empty ())
4072 return opt_result::success ();
4074 if (refs
.length () > 1)
4075 return opt_result::failure_at (stmt
,
4077 " more than one data ref in stmt: %G", stmt
);
4079 if (gcall
*call
= dyn_cast
<gcall
*> (stmt
))
4080 if (!gimple_call_internal_p (call
)
4081 || (gimple_call_internal_fn (call
) != IFN_MASK_LOAD
4082 && gimple_call_internal_fn (call
) != IFN_MASK_STORE
))
4083 return opt_result::failure_at (stmt
,
4084 "not vectorized: dr in a call %G", stmt
);
4086 data_reference_p dr
= refs
.pop ();
4087 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
4088 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
4089 return opt_result::failure_at (stmt
,
4091 " statement is bitfield access %G", stmt
);
4093 if (DR_BASE_ADDRESS (dr
)
4094 && TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
4095 return opt_result::failure_at (stmt
,
4097 " base addr of dr is a constant\n");
4099 /* Check whether this may be a SIMD lane access and adjust the
4100 DR to make it easier for us to handle it. */
4103 && (!DR_BASE_ADDRESS (dr
)
4108 struct data_reference
*newdr
4109 = create_data_ref (NULL
, loop_containing_stmt (stmt
), DR_REF (dr
), stmt
,
4110 DR_IS_READ (dr
), DR_IS_CONDITIONAL_IN_STMT (dr
));
4111 if (DR_BASE_ADDRESS (newdr
)
4112 && DR_OFFSET (newdr
)
4115 && TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
4116 && integer_zerop (DR_STEP (newdr
)))
4118 tree base_address
= DR_BASE_ADDRESS (newdr
);
4119 tree off
= DR_OFFSET (newdr
);
4120 tree step
= ssize_int (1);
4121 if (integer_zerop (off
)
4122 && TREE_CODE (base_address
) == POINTER_PLUS_EXPR
)
4124 off
= TREE_OPERAND (base_address
, 1);
4125 base_address
= TREE_OPERAND (base_address
, 0);
4128 if (TREE_CODE (off
) == MULT_EXPR
4129 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
4131 step
= TREE_OPERAND (off
, 1);
4132 off
= TREE_OPERAND (off
, 0);
4135 if (CONVERT_EXPR_P (off
)
4136 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
, 0)))
4137 < TYPE_PRECISION (TREE_TYPE (off
))))
4138 off
= TREE_OPERAND (off
, 0);
4139 if (TREE_CODE (off
) == SSA_NAME
)
4141 gimple
*def
= SSA_NAME_DEF_STMT (off
);
4142 /* Look through widening conversion. */
4143 if (is_gimple_assign (def
)
4144 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def
)))
4146 tree rhs1
= gimple_assign_rhs1 (def
);
4147 if (TREE_CODE (rhs1
) == SSA_NAME
4148 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1
))
4149 && (TYPE_PRECISION (TREE_TYPE (off
))
4150 > TYPE_PRECISION (TREE_TYPE (rhs1
))))
4151 def
= SSA_NAME_DEF_STMT (rhs1
);
4153 if (is_gimple_call (def
)
4154 && gimple_call_internal_p (def
)
4155 && (gimple_call_internal_fn (def
) == IFN_GOMP_SIMD_LANE
))
4157 tree arg
= gimple_call_arg (def
, 0);
4158 tree reft
= TREE_TYPE (DR_REF (newdr
));
4159 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
4160 arg
= SSA_NAME_VAR (arg
);
4161 if (arg
== loop
->simduid
4163 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft
), step
))
4165 DR_BASE_ADDRESS (newdr
) = base_address
;
4166 DR_OFFSET (newdr
) = ssize_int (0);
4167 DR_STEP (newdr
) = step
;
4168 DR_OFFSET_ALIGNMENT (newdr
) = BIGGEST_ALIGNMENT
;
4169 DR_STEP_ALIGNMENT (newdr
) = highest_pow2_factor (step
);
4170 /* Mark as simd-lane access. */
4171 tree arg2
= gimple_call_arg (def
, 1);
4172 newdr
->aux
= (void *) (-1 - tree_to_uhwi (arg2
));
4174 datarefs
->safe_push (newdr
);
4175 return opt_result::success ();
4180 free_data_ref (newdr
);
4183 datarefs
->safe_push (dr
);
4184 return opt_result::success ();
/* Function vect_analyze_data_refs.

   Find all the data references in the loop or basic block.

   The general structure of the analysis of data refs in the vectorizer is as
   follows:
   1- vect_analyze_data_refs(loop/bb): call
      compute_data_dependences_for_loop/bb to find and analyze all data-refs
      in the loop/bb and their dependences.
   2- vect_analyze_dependences(): apply dependence testing using ddrs.
   3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
   4- vect_analyze_drs_access(): check that ref_stmt.step is ok.

*/
4203 vect_analyze_data_refs (vec_info
*vinfo
, poly_uint64
*min_vf
, bool *fatal
)
4205 class loop
*loop
= NULL
;
4207 struct data_reference
*dr
;
4210 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4212 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
4213 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4215 /* Go through the data-refs, check that the analysis succeeded. Update
4216 pointer from stmt_vec_info struct to DR and vectype. */
4218 vec
<data_reference_p
> datarefs
= vinfo
->shared
->datarefs
;
4219 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
4221 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
4224 gcc_assert (DR_REF (dr
));
4225 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (DR_STMT (dr
));
4226 gcc_assert (!stmt_info
->dr_aux
.dr
);
4227 stmt_info
->dr_aux
.dr
= dr
;
4228 stmt_info
->dr_aux
.stmt
= stmt_info
;
4230 /* Check that analysis of the data-ref succeeded. */
4231 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
4236 && !TREE_THIS_VOLATILE (DR_REF (dr
))
4237 && (targetm
.vectorize
.builtin_gather
!= NULL
4238 || supports_vec_gather_load_p ());
4241 && !TREE_THIS_VOLATILE (DR_REF (dr
))
4242 && (targetm
.vectorize
.builtin_scatter
!= NULL
4243 || supports_vec_scatter_store_p ());
	  /* If the target supports vector gather loads or scatter stores,
	     see whether they can be used for this access instead.  */
4247 if (is_a
<loop_vec_info
> (vinfo
)
4248 && !nested_in_vect_loop_p (loop
, stmt_info
))
4250 if (maybe_gather
|| maybe_scatter
)
4253 gatherscatter
= GATHER
;
4255 gatherscatter
= SCATTER
;
4259 if (gatherscatter
== SG_NONE
)
4261 if (dump_enabled_p ())
4262 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4263 "not vectorized: data ref analysis "
4264 "failed %G", stmt_info
->stmt
);
4265 if (is_a
<bb_vec_info
> (vinfo
))
4267 /* In BB vectorization the ref can still participate
4268 in dependence analysis, we just can't vectorize it. */
4269 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4272 return opt_result::failure_at (stmt_info
->stmt
,
4274 " data ref analysis failed: %G",
4279 /* See if this was detected as SIMD lane access. */
4280 if (dr
->aux
== (void *)-1
4281 || dr
->aux
== (void *)-2
4282 || dr
->aux
== (void *)-3
4283 || dr
->aux
== (void *)-4)
4285 if (nested_in_vect_loop_p (loop
, stmt_info
))
4286 return opt_result::failure_at (stmt_info
->stmt
,
4288 " data ref analysis failed: %G",
4290 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
)
4291 = -(uintptr_t) dr
->aux
;
4294 tree base
= get_base_address (DR_REF (dr
));
4295 if (base
&& VAR_P (base
) && DECL_NONALIASED (base
))
4297 if (dump_enabled_p ())
4298 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4299 "not vectorized: base object not addressable "
4300 "for stmt: %G", stmt_info
->stmt
);
4301 if (is_a
<bb_vec_info
> (vinfo
))
4303 /* In BB vectorization the ref can still participate
4304 in dependence analysis, we just can't vectorize it. */
4305 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4308 return opt_result::failure_at (stmt_info
->stmt
,
4309 "not vectorized: base object not"
4310 " addressable for stmt: %G",
4314 if (is_a
<loop_vec_info
> (vinfo
)
4316 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
4318 if (nested_in_vect_loop_p (loop
, stmt_info
))
4319 return opt_result::failure_at (stmt_info
->stmt
,
4321 "not suitable for strided load %G",
4323 STMT_VINFO_STRIDED_P (stmt_info
) = true;
4326 /* Update DR field in stmt_vec_info struct. */
      /* If the dataref is in an inner-loop of the loop that is considered
	 for vectorization, we also want to analyze the access relative to
	 the outer-loop (DR contains information only relative to the
	 inner-most enclosing loop).  We do that by building a reference to
	 the first location accessed by the inner-loop, and analyzing it
	 relative to the outer-loop.  */
4334 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
4336 /* Build a reference to the first location accessed by the
4337 inner loop: *(BASE + INIT + OFFSET). By construction,
4338 this address must be invariant in the inner loop, so we
4339 can consider it as being used in the outer loop. */
4340 tree base
= unshare_expr (DR_BASE_ADDRESS (dr
));
4341 tree offset
= unshare_expr (DR_OFFSET (dr
));
4342 tree init
= unshare_expr (DR_INIT (dr
));
4343 tree init_offset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
),
4345 tree init_addr
= fold_build_pointer_plus (base
, init_offset
);
4346 tree init_ref
= build_fold_indirect_ref (init_addr
);
4348 if (dump_enabled_p ())
4349 dump_printf_loc (MSG_NOTE
, vect_location
,
4350 "analyze in outer loop: %T\n", init_ref
);
4353 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
),
4354 init_ref
, loop
, stmt_info
->stmt
);
4356 /* dr_analyze_innermost already explained the failure. */
4359 if (dump_enabled_p ())
4360 dump_printf_loc (MSG_NOTE
, vect_location
,
4361 "\touter base_address: %T\n"
4362 "\touter offset from base address: %T\n"
4363 "\touter constant offset from base address: %T\n"
4364 "\touter step: %T\n"
4365 "\touter base alignment: %d\n\n"
4366 "\touter base misalignment: %d\n"
4367 "\touter offset alignment: %d\n"
4368 "\touter step alignment: %d\n",
4369 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
),
4370 STMT_VINFO_DR_OFFSET (stmt_info
),
4371 STMT_VINFO_DR_INIT (stmt_info
),
4372 STMT_VINFO_DR_STEP (stmt_info
),
4373 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info
),
4374 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info
),
4375 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info
),
4376 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info
));
4379 /* Set vectype for STMT. */
4380 scalar_type
= TREE_TYPE (DR_REF (dr
));
4381 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
);
4384 if (dump_enabled_p ())
4386 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4387 "not vectorized: no vectype for stmt: %G",
4389 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
4390 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
4392 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
4395 if (is_a
<bb_vec_info
> (vinfo
))
4397 /* No vector type is fine, the ref can still participate
4398 in dependence analysis, we just can't vectorize it. */
4399 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4404 return opt_result::failure_at (stmt_info
->stmt
,
4406 " no vectype for stmt: %G"
4407 " scalar_type: %T\n",
4408 stmt_info
->stmt
, scalar_type
);
4412 if (dump_enabled_p ())
4413 dump_printf_loc (MSG_NOTE
, vect_location
,
4414 "got vectype for stmt: %G%T\n",
4415 stmt_info
->stmt
, vectype
);
      /* Adjust the minimal vectorization factor according to the
	 vector type.  */
4420 vf
= TYPE_VECTOR_SUBPARTS (vectype
);
4421 *min_vf
= upper_bound (*min_vf
, vf
);
4423 /* Leave the BB vectorizer to pick the vector type later, based on
4424 the final dataref group size and SLP node size. */
4425 if (is_a
<loop_vec_info
> (vinfo
))
4426 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
4428 if (gatherscatter
!= SG_NONE
)
4430 gather_scatter_info gs_info
;
4431 if (!vect_check_gather_scatter (stmt_info
,
4432 as_a
<loop_vec_info
> (vinfo
),
4434 || !get_vectype_for_scalar_type (vinfo
,
4435 TREE_TYPE (gs_info
.offset
)))
4439 return opt_result::failure_at
4441 (gatherscatter
== GATHER
)
4442 ? "not vectorized: not suitable for gather load %G"
4443 : "not vectorized: not suitable for scatter store %G",
4446 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
  /* We used to stop processing and prune the list here.  Verify we no
     longer need to.  */
== datarefs
.length ());
4454 return opt_result::success ();
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme appends the
   prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
   the name of vectorizer generated variables, and appends that to NAME if
   provided.  */
4466 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
4473 case vect_simple_var
:
4476 case vect_scalar_var
:
4482 case vect_pointer_var
:
4491 char* tmp
= concat (prefix
, "_", name
, NULL
);
4492 new_vect_var
= create_tmp_reg (type
, tmp
);
4496 new_vect_var
= create_tmp_reg (type
, prefix
);
4498 return new_vect_var
;
4501 /* Like vect_get_new_vect_var but return an SSA name. */
4504 vect_get_new_ssa_name (tree type
, enum vect_var_kind var_kind
, const char *name
)
4511 case vect_simple_var
:
4514 case vect_scalar_var
:
4517 case vect_pointer_var
:
4526 char* tmp
= concat (prefix
, "_", name
, NULL
);
4527 new_vect_var
= make_temp_ssa_name (type
, NULL
, tmp
);
4531 new_vect_var
= make_temp_ssa_name (type
, NULL
, prefix
);
4533 return new_vect_var
;
/* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO.  */
4539 vect_duplicate_ssa_name_ptr_info (tree name
, dr_vec_info
*dr_info
)
4541 duplicate_ssa_name_ptr_info (name
, DR_PTR_INFO (dr_info
->dr
));
4542 int misalign
= DR_MISALIGNMENT (dr_info
);
4543 if (misalign
== DR_MISALIGNMENT_UNKNOWN
)
4544 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name
));
4546 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name
),
4547 known_alignment (DR_TARGET_ALIGNMENT (dr_info
)),
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT_INFO: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
   LOOP:    Specify relative to which loop-nest should the address be computed.
	    For example, when the dataref is in an inner-loop nested in an
	    outer-loop that is now being vectorized, LOOP can be either the
	    outer-loop, or the inner-loop.  The first memory location accessed
	    by the following dataref ('in' points to short):

		for (i=0; i<N; i++)
		   for (j=0; j<M; j++)
		     s += in[i+j]

	    is as follows:
	    if LOOP=i_loop:	&in		(relative to i_loop)
	    if LOOP=j_loop:	&in+i*2B	(relative to j_loop)
   BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
	    initial address.  Unlike OFFSET, which is a number of elements to
	    be added, BYTE_OFFSET is measured in bytes.

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */
4586 vect_create_addr_base_for_vector_ref (vec_info
*vinfo
, stmt_vec_info stmt_info
,
4587 gimple_seq
*new_stmt_list
,
4591 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
4592 struct data_reference
*dr
= dr_info
->dr
;
4593 const char *base_name
;
4596 gimple_seq seq
= NULL
;
4598 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
4599 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
4600 innermost_loop_behavior
*drb
= vect_dr_behavior (vinfo
, dr_info
);
4602 tree data_ref_base
= unshare_expr (drb
->base_address
);
4603 tree base_offset
= unshare_expr (get_dr_vinfo_offset (vinfo
, dr_info
, true));
4604 tree init
= unshare_expr (drb
->init
);
4607 base_name
= get_name (data_ref_base
);
4610 base_offset
= ssize_int (0);
4611 init
= ssize_int (0);
4612 base_name
= get_name (DR_REF (dr
));
4615 /* Create base_offset */
4616 base_offset
= size_binop (PLUS_EXPR
,
4617 fold_convert (sizetype
, base_offset
),
4618 fold_convert (sizetype
, init
));
4622 offset
= fold_build2 (MULT_EXPR
, sizetype
,
4623 fold_convert (sizetype
, offset
), step
);
4624 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4625 base_offset
, offset
);
4629 byte_offset
= fold_convert (sizetype
, byte_offset
);
4630 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4631 base_offset
, byte_offset
);
4634 /* base + base_offset */
4636 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
4639 addr_base
= build1 (ADDR_EXPR
,
4640 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
4641 unshare_expr (DR_REF (dr
)));
4644 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
4645 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
4646 addr_base
= force_gimple_operand (addr_base
, &seq
, true, dest
);
4647 gimple_seq_add_seq (new_stmt_list
, seq
);
4649 if (DR_PTR_INFO (dr
)
4650 && TREE_CODE (addr_base
) == SSA_NAME
4651 && !SSA_NAME_PTR_INFO (addr_base
))
4653 vect_duplicate_ssa_name_ptr_info (addr_base
, dr_info
);
4654 if (offset
|| byte_offset
)
4655 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);

  return addr_base;
}
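
/* Illustrative sketch (not part of the vectorizer sources): the address
   built above is base_address + offset-from-base + init, with the optional
   OFFSET scaled by the element size and BYTE_OFFSET added as-is.  A plain-C
   model of that arithmetic, with hypothetical names:  */

static char *
example_addr_base (char *base_address, long base_offset_bytes,
		   long init_bytes, long offset_elements,
		   long element_size, long byte_offset)
{
  long total = base_offset_bytes + init_bytes
	       + offset_elements * element_size + byte_offset;
  return base_address + total;
}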
/* Function vect_create_data_ref_ptr.

   Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
   location accessed in the loop by STMT_INFO, along with the def-use update
   chain to appropriately advance the pointer through the loop iterations.
   Also set aliasing information for the pointer.  This pointer is used by
   the callers to this function to create a memory reference expression for
   vector load/store access.

   Input:
   1. STMT_INFO: a stmt that references memory.  Expected to be of the form
         GIMPLE_ASSIGN <name, data-ref> or
	 GIMPLE_ASSIGN <data-ref, name>.
   2. AGGR_TYPE: the type of the reference, which should be either a vector
        or an array.
   3. AT_LOOP: the loop where the vector memref is to be created.
   4. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT_INFO.
   5. BSI: location where the new stmts are to be placed if there is no loop
   6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
        pointing to the initial address.
   7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
	to the initial address accessed by the data-ref in STMT_INFO.  This is
	similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
	in bytes.
   8. IV_STEP (optional, defaults to NULL): the amount that should be added
	to the IV during each iteration of the loop.  NULL says to move
	by one copy of AGGR_TYPE up or down, depending on the step of the
	data reference.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial addressed accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *ap;
      ap = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];
      if BYTE_OFFSET is supplied:
	 initial_address = &a[init] + BYTE_OFFSET;

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Return the pointer.  */
4720 vect_create_data_ref_ptr (vec_info
*vinfo
, stmt_vec_info stmt_info
,
4721 tree aggr_type
, class loop
*at_loop
, tree offset
,
4722 tree
*initial_address
, gimple_stmt_iterator
*gsi
,
4723 gimple
**ptr_incr
, bool only_init
,
4724 tree byte_offset
, tree iv_step
)
4726 const char *base_name
;
4727 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
4728 class loop
*loop
= NULL
;
4729 bool nested_in_vect_loop
= false;
4730 class loop
*containing_loop
= NULL
;
4734 gimple_seq new_stmt_list
= NULL
;
4738 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
4739 struct data_reference
*dr
= dr_info
->dr
;
4741 gimple_stmt_iterator incr_gsi
;
4743 tree indx_before_incr
, indx_after_incr
;
4745 bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
);
4747 gcc_assert (iv_step
!= NULL_TREE
4748 || TREE_CODE (aggr_type
) == ARRAY_TYPE
4749 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
4753 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4754 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt_info
);
4755 containing_loop
= (gimple_bb (stmt_info
->stmt
))->loop_father
;
4756 pe
= loop_preheader_edge (loop
);
4760 gcc_assert (bb_vinfo
);
4765 /* Create an expression for the first address accessed by this load
4767 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4769 if (dump_enabled_p ())
4771 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4772 dump_printf_loc (MSG_NOTE
, vect_location
,
4773 "create %s-pointer variable to type: %T",
4774 get_tree_code_name (TREE_CODE (aggr_type
)),
4776 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4777 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4778 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4779 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4780 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4781 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4783 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4784 dump_printf (MSG_NOTE
, "%T\n", DR_BASE_OBJECT (dr
));
4787 /* (1) Create the new aggregate-pointer variable.
4788 Vector and array types inherit the alias set of their component
4789 type by default so we need to use a ref-all pointer if the data
4790 reference does not conflict with the created aggregated data
4791 reference because it is not addressable. */
4792 bool need_ref_all
= false;
4793 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4794 get_alias_set (DR_REF (dr
))))
4795 need_ref_all
= true;
4796 /* Likewise for any of the data references in the stmt group. */
4797 else if (DR_GROUP_SIZE (stmt_info
) > 1)
4799 stmt_vec_info sinfo
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
4802 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
4803 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4804 get_alias_set (DR_REF (sdr
))))
4806 need_ref_all
= true;
4809 sinfo
= DR_GROUP_NEXT_ELEMENT (sinfo
);
4813 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
4815 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
4818 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4819 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4820 def-use update cycles for the pointer: one relative to the outer-loop
4821 (LOOP), which is what steps (3) and (4) below do. The other is relative
4822 to the inner-loop (which is the inner-most loop containing the dataref),
4823 and this is done be step (5) below.
4825 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4826 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4827 redundant. Steps (3),(4) create the following:
4830 LOOP: vp1 = phi(vp0,vp2)
4836 If there is an inner-loop nested in loop, then step (5) will also be
4837 applied, and an additional update in the inner-loop will be created:
4840 LOOP: vp1 = phi(vp0,vp2)
4842 inner: vp3 = phi(vp1,vp4)
4843 vp4 = vp3 + inner_step
4849 /* (2) Calculate the initial address of the aggregate-pointer, and set
4850 the aggregate-pointer to point to it before the loop. */
4852 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4854 new_temp
= vect_create_addr_base_for_vector_ref (vinfo
,
4855 stmt_info
, &new_stmt_list
,
4856 offset
, byte_offset
);
4861 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
4862 gcc_assert (!new_bb
);
4865 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
4868 *initial_address
= new_temp
;
4869 aggr_ptr_init
= new_temp
;
4871 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4872 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4873 inner-loop nested in LOOP (during outer-loop vectorization). */
4875 /* No update in loop is required. */
4876 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
4877 aptr
= aggr_ptr_init
;
4880 /* Accesses to invariant addresses should be handled specially
4882 tree step
= vect_dr_behavior (vinfo
, dr_info
)->step
;
4883 gcc_assert (!integer_zerop (step
));
4885 if (iv_step
== NULL_TREE
)
4887 /* The step of the aggregate pointer is the type size,
4888 negated for downward accesses. */
4889 iv_step
= TYPE_SIZE_UNIT (aggr_type
);
4890 if (tree_int_cst_sgn (step
) == -1)
4891 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
4894 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4896 create_iv (aggr_ptr_init
,
4897 fold_convert (aggr_ptr_type
, iv_step
),
4898 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
4899 &indx_before_incr
, &indx_after_incr
);
4900 incr
= gsi_stmt (incr_gsi
);
4901 loop_vinfo
->add_stmt (incr
);
4903 /* Copy the points-to information if it exists. */
4904 if (DR_PTR_INFO (dr
))
4906 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr_info
);
4907 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr_info
);
4912 aptr
= indx_before_incr
;
4915 if (!nested_in_vect_loop
|| only_init
)
4919 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4920 nested in LOOP, if exists. */
4922 gcc_assert (nested_in_vect_loop
);
4925 standard_iv_increment_position (containing_loop
, &incr_gsi
,
4927 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
4928 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
4930 incr
= gsi_stmt (incr_gsi
);
4931 loop_vinfo
->add_stmt (incr
);
4933 /* Copy the points-to information if it exists. */
4934 if (DR_PTR_INFO (dr
))
4936 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr_info
);
4937 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr_info
);
  return indx_before_incr;
}
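
/* Illustrative sketch (not part of the vectorizer sources): in scalar terms
   the pointer induction variable created above follows the pattern
       vp = initial_address;  loop { use (vp); vp += iv_step; }
   A plain-C model of one such walk over a buffer; the use_vector callback
   and all names are hypothetical:  */

static void
example_data_ref_ptr_walk (char *initial_address, long iv_step, int niters,
			   void (*use_vector) (char *))
{
  char *vp = initial_address;	/* (2) initial address before the loop      */
  for (int i = 0; i < niters; i++)
    {
      use_vector (vp);		/* the vector load/store through the pointer */
      vp += iv_step;		/* (3) per-iteration pointer update          */
    }
}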
/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size.  If requested,
   i.e. if PTR-INCR is given, then also connect the new increment stmt
   to the existing def-use update-chain of the pointer, by modifying
   the PTR_INCR as illustrated below:

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        NEW_DATAREF_PTR = DATAREF_PTR + BUMP
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - optional.  The stmt that updates the pointer in each iteration of
	      the loop.  The increment amount across iterations is expected
	      to be vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT_INFO - the original scalar memory-access stmt that is being vectorized.
   BUMP - optional.  The offset by which to bump the pointer.  If not given,
	  the offset is assumed to be vector_size.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */
4984 bump_vector_ptr (vec_info
*vinfo
,
4985 tree dataref_ptr
, gimple
*ptr_incr
, gimple_stmt_iterator
*gsi
,
4986 stmt_vec_info stmt_info
, tree bump
)
4988 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4989 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4990 tree update
= TYPE_SIZE_UNIT (vectype
);
4993 use_operand_p use_p
;
4994 tree new_dataref_ptr
;
4999 if (TREE_CODE (dataref_ptr
) == SSA_NAME
)
5000 new_dataref_ptr
= copy_ssa_name (dataref_ptr
);
5002 new_dataref_ptr
= make_ssa_name (TREE_TYPE (dataref_ptr
));
5003 incr_stmt
= gimple_build_assign (new_dataref_ptr
, POINTER_PLUS_EXPR
,
5004 dataref_ptr
, update
);
5005 vect_finish_stmt_generation (vinfo
, stmt_info
, incr_stmt
, gsi
);
5007 /* Copy the points-to information if it exists. */
5008 if (DR_PTR_INFO (dr
))
5010 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
5011 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
5015 return new_dataref_ptr
;
5017 /* Update the vector-pointer's cross-iteration increment. */
5018 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
5020 tree use
= USE_FROM_PTR (use_p
);
5022 if (use
== dataref_ptr
)
5023 SET_USE (use_p
, new_dataref_ptr
);
5025 gcc_assert (operand_equal_p (use
, update
, 0));
5028 return new_dataref_ptr
;
5032 /* Copy memory reference info such as base/clique from the SRC reference
5033 to the DEST MEM_REF. */
5036 vect_copy_ref_info (tree dest
, tree src
)
5038 if (TREE_CODE (dest
) != MEM_REF
)
5041 tree src_base
= src
;
5042 while (handled_component_p (src_base
))
5043 src_base
= TREE_OPERAND (src_base
, 0);
5044 if (TREE_CODE (src_base
) != MEM_REF
5045 && TREE_CODE (src_base
) != TARGET_MEM_REF
)
5048 MR_DEPENDENCE_CLIQUE (dest
) = MR_DEPENDENCE_CLIQUE (src_base
);
5049 MR_DEPENDENCE_BASE (dest
) = MR_DEPENDENCE_BASE (src_base
);
5053 /* Function vect_create_destination_var.
5055 Create a new temporary of type VECTYPE. */
5058 vect_create_destination_var (tree scalar_dest
, tree vectype
)
5064 enum vect_var_kind kind
;
5067 ? VECTOR_BOOLEAN_TYPE_P (vectype
)
5071 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
5073 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
5075 name
= get_name (scalar_dest
);
5077 new_name
= xasprintf ("%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
5079 new_name
= xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest
));
5080 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
5086 /* Function vect_grouped_store_supported.
5088 Returns TRUE if interleave high and interleave low permutations
5089 are supported, and FALSE otherwise. */
5092 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
5094 machine_mode mode
= TYPE_MODE (vectype
);
5096 /* vect_permute_store_chain requires the group size to be equal to 3 or
5097 be a power of two. */
5098 if (count
!= 3 && exact_log2 (count
) == -1)
5100 if (dump_enabled_p ())
5101 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5102 "the size of the group of accesses"
5103 " is not a power of 2 or not eqaul to 3\n");
5107 /* Check that the permutation is supported. */
5108 if (VECTOR_MODE_P (mode
))
5113 unsigned int j0
= 0, j1
= 0, j2
= 0;
5117 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
5119 if (dump_enabled_p ())
5120 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5121 "cannot handle groups of 3 stores for"
5122 " variable-length vectors\n");
5126 vec_perm_builder
sel (nelt
, nelt
, 1);
5127 sel
.quick_grow (nelt
);
5128 vec_perm_indices indices
;
5129 for (j
= 0; j
< 3; j
++)
5131 int nelt0
= ((3 - j
) * nelt
) % 3;
5132 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5133 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5134 for (i
= 0; i
< nelt
; i
++)
5136 if (3 * i
+ nelt0
< nelt
)
5137 sel
[3 * i
+ nelt0
] = j0
++;
5138 if (3 * i
+ nelt1
< nelt
)
5139 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5140 if (3 * i
+ nelt2
< nelt
)
5141 sel
[3 * i
+ nelt2
] = 0;
5143 indices
.new_vector (sel
, 2, nelt
);
5144 if (!can_vec_perm_const_p (mode
, indices
))
5146 if (dump_enabled_p ())
5147 dump_printf (MSG_MISSED_OPTIMIZATION
,
5148 "permutation op not supported by target.\n");
5152 for (i
= 0; i
< nelt
; i
++)
5154 if (3 * i
+ nelt0
< nelt
)
5155 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5156 if (3 * i
+ nelt1
< nelt
)
5157 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5158 if (3 * i
+ nelt2
< nelt
)
5159 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5161 indices
.new_vector (sel
, 2, nelt
);
5162 if (!can_vec_perm_const_p (mode
, indices
))
5164 if (dump_enabled_p ())
5165 dump_printf (MSG_MISSED_OPTIMIZATION
,
5166 "permutation op not supported by target.\n");
5174 /* If length is not equal to 3 then only power of 2 is supported. */
5175 gcc_assert (pow2p_hwi (count
));
5176 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
5178 /* The encoding has 2 interleaved stepped patterns. */
5179 vec_perm_builder
sel (nelt
, 2, 3);
5181 for (i
= 0; i
< 3; i
++)
5184 sel
[i
* 2 + 1] = i
+ nelt
;
5186 vec_perm_indices
indices (sel
, 2, nelt
);
5187 if (can_vec_perm_const_p (mode
, indices
))
5189 for (i
= 0; i
< 6; i
++)
5190 sel
[i
] += exact_div (nelt
, 2);
5191 indices
.new_vector (sel
, 2, nelt
);
5192 if (can_vec_perm_const_p (mode
, indices
))
5198 if (dump_enabled_p ())
5199 dump_printf (MSG_MISSED_OPTIMIZATION
,
5200 "permutation op not supported by target.\n");
5205 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5206 type VECTYPE. MASKED_P says whether the masked form is needed. */
5209 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
,
5213 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5214 vec_mask_store_lanes_optab
,
5217 return vect_lanes_optab_supported_p ("vec_store_lanes",
5218 vec_store_lanes_optab
,
5223 /* Function vect_permute_store_chain.
5225 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5226 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5227 the data correctly for the stores. Return the final references for stores
5230 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5231 The input is 4 vectors each containing 8 elements. We assign a number to
5232 each element, the input sequence is:
5234 1st vec: 0 1 2 3 4 5 6 7
5235 2nd vec: 8 9 10 11 12 13 14 15
5236 3rd vec: 16 17 18 19 20 21 22 23
5237 4th vec: 24 25 26 27 28 29 30 31
5239 The output sequence should be:
   1st vec:  0  8 16 24  1  9 17 25
   2nd vec:  2 10 18 26  3 11 19 27
   3rd vec:  4 12 20 28  5 13 21 29
   4th vec:  6 14 22 30  7 15 23 31
5246 i.e., we interleave the contents of the four vectors in their order.
5248 We use interleave_high/low instructions to create such output. The input of
5249 each interleave_high/low operation is two vectors:
5252 the even elements of the result vector are obtained left-to-right from the
5253 high/low elements of the first vector. The odd elements of the result are
5254 obtained left-to-right from the high/low elements of the second vector.
5255 The output of interleave_high will be: 0 4 1 5
5256 and of interleave_low: 2 6 3 7
5259 The permutation is done in log LENGTH stages. In each stage interleave_high
5260 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5261 where the first argument is taken from the first half of DR_CHAIN and the
5262 second argument from it's second half.
5265 I1: interleave_high (1st vec, 3rd vec)
5266 I2: interleave_low (1st vec, 3rd vec)
5267 I3: interleave_high (2nd vec, 4th vec)
5268 I4: interleave_low (2nd vec, 4th vec)
5270 The output for the first stage is:
5272 I1: 0 16 1 17 2 18 3 19
5273 I2: 4 20 5 21 6 22 7 23
5274 I3: 8 24 9 25 10 26 11 27
5275 I4: 12 28 13 29 14 30 15 31
5277 The output of the second stage, i.e. the final result is:
        I1:  0  8 16 24  1  9 17 25
        I2:  2 10 18 26  3 11 19 27
        I3:  4 12 20 28  5 13 21 29
        I4:  6 14 22 30  7 15 23 31.  */
5285 vect_permute_store_chain (vec_info
*vinfo
, vec
<tree
> dr_chain
,
5286 unsigned int length
,
5287 stmt_vec_info stmt_info
,
5288 gimple_stmt_iterator
*gsi
,
5289 vec
<tree
> *result_chain
)
5291 tree vect1
, vect2
, high
, low
;
5293 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5294 tree perm_mask_low
, perm_mask_high
;
5296 tree perm3_mask_low
, perm3_mask_high
;
5297 unsigned int i
, j
, n
, log_length
= exact_log2 (length
);
5299 result_chain
->quick_grow (length
);
5300 memcpy (result_chain
->address (), dr_chain
.address (),
5301 length
* sizeof (tree
));
5305 /* vect_grouped_store_supported ensures that this is constant. */
5306 unsigned int nelt
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
5307 unsigned int j0
= 0, j1
= 0, j2
= 0;
5309 vec_perm_builder
sel (nelt
, nelt
, 1);
5310 sel
.quick_grow (nelt
);
5311 vec_perm_indices indices
;
5312 for (j
= 0; j
< 3; j
++)
5314 int nelt0
= ((3 - j
) * nelt
) % 3;
5315 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5316 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5318 for (i
= 0; i
< nelt
; i
++)
5320 if (3 * i
+ nelt0
< nelt
)
5321 sel
[3 * i
+ nelt0
] = j0
++;
5322 if (3 * i
+ nelt1
< nelt
)
5323 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5324 if (3 * i
+ nelt2
< nelt
)
5325 sel
[3 * i
+ nelt2
] = 0;
5327 indices
.new_vector (sel
, 2, nelt
);
5328 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5330 for (i
= 0; i
< nelt
; i
++)
5332 if (3 * i
+ nelt0
< nelt
)
5333 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5334 if (3 * i
+ nelt1
< nelt
)
5335 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5336 if (3 * i
+ nelt2
< nelt
)
5337 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5339 indices
.new_vector (sel
, 2, nelt
);
5340 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5342 vect1
= dr_chain
[0];
5343 vect2
= dr_chain
[1];
5345 /* Create interleaving stmt:
5346 low = VEC_PERM_EXPR <vect1, vect2,
5347 {j, nelt, *, j + 1, nelt + j + 1, *,
5348 j + 2, nelt + j + 2, *, ...}> */
5349 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5350 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5351 vect2
, perm3_mask_low
);
5352 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5355 vect2
= dr_chain
[2];
5356 /* Create interleaving stmt:
5357 low = VEC_PERM_EXPR <vect1, vect2,
5358 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5359 6, 7, nelt + j + 2, ...}> */
5360 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5361 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5362 vect2
, perm3_mask_high
);
5363 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5364 (*result_chain
)[j
] = data_ref
;
5369 /* If length is not equal to 3 then only power of 2 is supported. */
5370 gcc_assert (pow2p_hwi (length
));
5372 /* The encoding has 2 interleaved stepped patterns. */
5373 poly_uint64 nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5374 vec_perm_builder
sel (nelt
, 2, 3);
5376 for (i
= 0; i
< 3; i
++)
5379 sel
[i
* 2 + 1] = i
+ nelt
;
5381 vec_perm_indices
indices (sel
, 2, nelt
);
5382 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5384 for (i
= 0; i
< 6; i
++)
5385 sel
[i
] += exact_div (nelt
, 2);
5386 indices
.new_vector (sel
, 2, nelt
);
5387 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5389 for (i
= 0, n
= log_length
; i
< n
; i
++)
5391 for (j
= 0; j
< length
/2; j
++)
5393 vect1
= dr_chain
[j
];
5394 vect2
= dr_chain
[j
+length
/2];
5396 /* Create interleaving stmt:
5397 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5399 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
5400 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
5401 vect2
, perm_mask_high
);
5402 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5403 (*result_chain
)[2*j
] = high
;
5405 /* Create interleaving stmt:
5406 low = VEC_PERM_EXPR <vect1, vect2,
5407 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5409 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
5410 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
5411 vect2
, perm_mask_low
);
5412 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
	      (*result_chain)[2*j+1] = low;
	    }
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
    }
}
5421 /* Function vect_setup_realignment
5423 This function is called when vectorizing an unaligned load using
5424 the dr_explicit_realign[_optimized] scheme.
5425 This function generates the following code at the loop prolog:
5428 x msq_init = *(floor(p)); # prolog load
5429 realignment_token = call target_builtin;
5431 x msq = phi (msq_init, ---)
5433 The stmts marked with x are generated only for the case of
5434 dr_explicit_realign_optimized.
5436 The code above sets up a new (vector) pointer, pointing to the first
5437 location accessed by STMT_INFO, and a "floor-aligned" load using that
5438 pointer. It also generates code to compute the "realignment-token"
5439 (if the relevant target hook was defined), and creates a phi-node at the
5440 loop-header bb whose arguments are the result of the prolog-load (created
5441 by this function) and the result of a load that takes place in the loop
5442 (to be created by the caller to this function).
5444 For the case of dr_explicit_realign_optimized:
5445 The caller to this function uses the phi-result (msq) to create the
5446 realignment code inside the loop, and sets up the missing phi argument,
5449 msq = phi (msq_init, lsq)
5450 lsq = *(floor(p')); # load in loop
5451 result = realign_load (msq, lsq, realignment_token);
5453 For the case of dr_explicit_realign:
5455 msq = *(floor(p)); # load in loop
5457 lsq = *(floor(p')); # load in loop
5458 result = realign_load (msq, lsq, realignment_token);
5461 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5462 a memory location that may be unaligned.
5463 BSI - place where new code is to be inserted.
5464 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5468 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5469 target hook, if defined.
5470 Return value - the result of the loop-header phi node. */
5473 vect_setup_realignment (vec_info
*vinfo
, stmt_vec_info stmt_info
,
5474 gimple_stmt_iterator
*gsi
, tree
*realignment_token
,
5475 enum dr_alignment_support alignment_support_scheme
,
5477 class loop
**at_loop
)
5479 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5480 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
5481 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
5482 struct data_reference
*dr
= dr_info
->dr
;
5483 class loop
*loop
= NULL
;
5485 tree scalar_dest
= gimple_assign_lhs (stmt_info
->stmt
);
5491 tree msq_init
= NULL_TREE
;
5494 tree msq
= NULL_TREE
;
5495 gimple_seq stmts
= NULL
;
5496 bool compute_in_loop
= false;
5497 bool nested_in_vect_loop
= false;
5498 class loop
*containing_loop
= (gimple_bb (stmt_info
->stmt
))->loop_father
;
5499 class loop
*loop_for_initial_load
= NULL
;
5503 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5504 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt_info
);
5507 gcc_assert (alignment_support_scheme
== dr_explicit_realign
5508 || alignment_support_scheme
== dr_explicit_realign_optimized
);
5510 /* We need to generate three things:
5511 1. the misalignment computation
     2. the extra vector load (for the optimized realignment scheme).
     3. the phi node for the two vectors from which the realignment is
      done (for the optimized realignment scheme).  */

  /* 1. Determine where to generate the misalignment computation.

     If INIT_ADDR is NULL_TREE, this indicates that the misalignment
     calculation will be generated by this function, outside the loop (in the
     preheader).  Otherwise, INIT_ADDR had already been computed for us by the
     caller, inside the loop.

     Background: If the misalignment remains fixed throughout the iterations of
     the loop, then both realignment schemes are applicable, and also the
     misalignment computation can be done outside LOOP.  This is because we are
     vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
     are a multiple of VS (the Vector Size), and therefore the misalignment in
     different vectorized LOOP iterations is always the same.
     The problem arises only if the memory access is in an inner-loop nested
     inside LOOP, which is now being vectorized using outer-loop vectorization.
     This is the only case when the misalignment of the memory access may not
     remain fixed throughout the iterations of the inner-loop (as explained in
     detail in vect_supportable_dr_alignment).  In this case, not only is the
     optimized realignment scheme not applicable, but also the misalignment
     computation (and generation of the realignment token that is passed to
     REALIGN_LOAD) have to be done inside the loop.

     In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
     or not, which in turn determines if the misalignment is computed inside
     the inner-loop, or outside LOOP.  */

  if (init_addr != NULL_TREE || !loop_vinfo)
    {
      compute_in_loop = true;
      gcc_assert (alignment_support_scheme == dr_explicit_realign);
    }

  /* 2. Determine where to generate the extra vector load.

     For the optimized realignment scheme, instead of generating two vector
     loads in each iteration, we generate a single extra vector load in the
     preheader of the loop, and in each iteration reuse the result of the
     vector load from the previous iteration.  In case the memory access is in
     an inner-loop nested inside LOOP, which is now being vectorized using
     outer-loop vectorization, we need to determine whether this initial vector
     load should be generated at the preheader of the inner-loop, or can be
     generated at the preheader of LOOP.  If the memory access has no evolution
     in LOOP, it can be generated in the preheader of LOOP.  Otherwise, it has
     to be generated inside LOOP (in the preheader of the inner-loop).  */

  if (nested_in_vect_loop)
    {
      tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
      bool invariant_in_outerloop =
        (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
      loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
    }
  else
    loop_for_initial_load = loop;
  if (at_loop)
    *at_loop = loop_for_initial_load;

  if (loop_for_initial_load)
    pe = loop_preheader_edge (loop_for_initial_load);

  /* 3. For the case of the optimized realignment, create the first vector
      load at the loop preheader.  */

  if (alignment_support_scheme == dr_explicit_realign_optimized)
    {
      /* Create msq_init = *(floor(p1)) in the loop preheader.  */
      gassign *new_stmt;

      gcc_assert (!compute_in_loop);
      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
                                      loop_for_initial_load, NULL_TREE,
                                      &init_addr, NULL, &inc, true);
      if (TREE_CODE (ptr) == SSA_NAME)
        new_temp = copy_ssa_name (ptr);
      else
        new_temp = make_ssa_name (TREE_TYPE (ptr));
      poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
      tree type = TREE_TYPE (ptr);
      new_stmt = gimple_build_assign
                   (new_temp, BIT_AND_EXPR, ptr,
                    fold_build2 (MINUS_EXPR, type,
                                 build_int_cst (type, 0),
                                 build_int_cst (type, align)));
      new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
      gcc_assert (!new_bb);
      data_ref
        = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
                  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
      vect_copy_ref_info (data_ref, DR_REF (dr));
      new_stmt = gimple_build_assign (vec_dest, data_ref);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_assign_set_lhs (new_stmt, new_temp);
      if (pe)
        {
          new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
          gcc_assert (!new_bb);
        }
      else
        gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);

      msq_init = gimple_assign_lhs (new_stmt);
    }

  /* 4. Create realignment token using a target builtin, if available.
      It is done either inside the containing loop, or before LOOP (as
      determined above).  */

  if (targetm.vectorize.builtin_mask_for_load)
    {
      gcall *new_stmt;
      tree builtin_decl;

      /* Compute INIT_ADDR - the initial address accessed by this memref.  */
      if (!init_addr)
        {
          /* Generate the INIT_ADDR computation outside LOOP.  */
          init_addr = vect_create_addr_base_for_vector_ref (vinfo,
                                                            stmt_info, &stmts,
                                                            NULL_TREE);
          if (loop)
            {
              pe = loop_preheader_edge (loop);
              new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
              gcc_assert (!new_bb);
            }
          else
            gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
        }

      builtin_decl = targetm.vectorize.builtin_mask_for_load ();
      new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
      vec_dest =
        vect_create_destination_var (scalar_dest,
                                     gimple_call_return_type (new_stmt));
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_call_set_lhs (new_stmt, new_temp);

      if (compute_in_loop)
        gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
      else
        {
          /* Generate the misalignment computation outside LOOP.  */
          pe = loop_preheader_edge (loop);
          new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
          gcc_assert (!new_bb);
        }

      *realignment_token = gimple_call_lhs (new_stmt);

      /* The result of the CALL_EXPR to this builtin is determined from
         the value of the parameter and no global variables are touched
         which makes the builtin a "const" function.  Requiring the
         builtin to have the "const" attribute makes it unnecessary
         to call mark_call_clobbered.  */
      gcc_assert (TREE_READONLY (builtin_decl));
    }

  if (alignment_support_scheme == dr_explicit_realign)
    return msq;

  gcc_assert (!compute_in_loop);
  gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);

  /* 5. Create msq = phi <msq_init, lsq> in loop.  */

  pe = loop_preheader_edge (containing_loop);
  vec_dest = vect_create_destination_var (scalar_dest, vectype);
  msq = make_ssa_name (vec_dest);
  phi_stmt = create_phi_node (msq, containing_loop->header);
  add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);

  return msq;
}
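
/* Illustrative sketch, not part of the vectorizer: the msq_init load above
   floor-aligns the data pointer with a BIT_AND_EXPR against (0 - align).
   The hypothetical helper below shows the same computation on a plain
   integer address; the name and the use of unsigned long long are
   assumptions made purely for exposition.  */

static inline unsigned long long
example_floor_align_address (unsigned long long addr, unsigned long long align)
{
  /* ALIGN is assumed to be a power of two; clearing the low bits rounds
     ADDR down to the enclosing aligned chunk, and the cleared bits are the
     misalignment that the realignment token encodes.  */
  return addr & (0 - align);
}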
/* Function vect_grouped_load_supported.

   COUNT is the size of the load group (the number of statements plus the
   number of gaps).  SINGLE_ELEMENT_P is true if there is actually
   only one statement, with a gap of COUNT - 1.

   Returns true if a suitable permute exists.  */

bool
vect_grouped_load_supported (tree vectype, bool single_element_p,
                             unsigned HOST_WIDE_INT count)
{
  machine_mode mode = TYPE_MODE (vectype);

  /* If this is single-element interleaving with an element distance
     that leaves unused vector loads around punt - we at least create
     very sub-optimal code in that case (and blow up memory).  */
  if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "single-element interleaving not supported "
                         "for not adjacent vector loads\n");
      return false;
    }

  /* vect_permute_load_chain requires the group size to be equal to 3 or
     be a power of two.  */
  if (count != 3 && exact_log2 (count) == -1)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "the size of the group of accesses"
                         " is not a power of 2 or not equal to 3\n");
      return false;
    }

  /* Check that the permutation is supported.  */
  if (VECTOR_MODE_P (mode))
    {
      unsigned int i, j, k;
      if (count == 3)
        {
          unsigned int nelt;
          if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "cannot handle groups of 3 loads for"
                                 " variable-length vectors\n");
              return false;
            }

          vec_perm_builder sel (nelt, nelt, 1);
          sel.quick_grow (nelt);
          vec_perm_indices indices;
          for (k = 0; k < 3; k++)
            {
              for (i = 0; i < nelt; i++)
                if (3 * i + k < 2 * nelt)
                  sel[i] = 3 * i + k;
                else
                  sel[i] = 0;
              indices.new_vector (sel, 2, nelt);
              if (!can_vec_perm_const_p (mode, indices))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "shuffle of 3 loads is not supported by"
                                     " target\n");
                  return false;
                }
              for (i = 0, j = 0; i < nelt; i++)
                if (3 * i + k < 2 * nelt)
                  sel[i] = i;
                else
                  sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
              indices.new_vector (sel, 2, nelt);
              if (!can_vec_perm_const_p (mode, indices))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "shuffle of 3 loads is not supported by"
                                     " target\n");
                  return false;
                }
            }
          return true;
        }
      else
        {
          /* If length is not equal to 3 then only power of 2 is supported.  */
          gcc_assert (pow2p_hwi (count));
          poly_uint64 nelt = GET_MODE_NUNITS (mode);

          /* The encoding has a single stepped pattern.  */
          vec_perm_builder sel (nelt, 1, 3);
          sel.quick_grow (3);
          for (i = 0; i < 3; i++)
            sel[i] = i * 2;
          vec_perm_indices indices (sel, 2, nelt);
          if (can_vec_perm_const_p (mode, indices))
            {
              for (i = 0; i < 3; i++)
                sel[i] = i * 2 + 1;
              indices.new_vector (sel, 2, nelt);
              if (can_vec_perm_const_p (mode, indices))
                return true;
            }
        }
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                     "extract even/odd not supported by target\n");
  return false;
}
/* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
   type VECTYPE.  MASKED_P says whether the masked form is needed.  */

bool
vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
                           bool masked_p)
{
  if (masked_p)
    return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
                                         vec_mask_load_lanes_optab,
                                         vectype, count);
  else
    return vect_lanes_optab_supported_p ("vec_load_lanes",
                                         vec_load_lanes_optab,
                                         vectype, count);
}
/* Function vect_permute_load_chain.

   Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
   a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
   the input data correctly.  Return the final references for loads in
   RESULT_CHAIN.

   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
   The input is 4 vectors each containing 8 elements.  We assign a number to
   each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23
   4th vec:  24 25 26 27 28 29 30 31

   The output sequence should be:

   1st vec:  0 4  8 12 16 20 24 28
   2nd vec:  1 5  9 13 17 21 25 29
   3rd vec:  2 6 10 14 18 22 26 30
   4th vec:  3 7 11 15 19 23 27 31

   i.e., the first output vector should contain the first elements of each
   interleaving group, etc.

   We use extract_even/odd instructions to create such output.  The input of
   each extract_even/odd operation is two vectors
   1st vec    2nd vec
   0 1 2 3    4 5 6 7

   and the output is the vector of extracted even/odd elements.  The output of
   extract_even will be:  0 2 4 6
   and of extract_odd:    1 3 5 7

   The permutation is done in log LENGTH stages.  In each stage extract_even
   and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
   their order.  In our example,

   E1: extract_even (1st vec, 2nd vec)
   E2: extract_odd (1st vec, 2nd vec)
   E3: extract_even (3rd vec, 4th vec)
   E4: extract_odd (3rd vec, 4th vec)

   The output for the first stage will be:

   E1:  0  2  4  6  8 10 12 14
   E2:  1  3  5  7  9 11 13 15
   E3: 16 18 20 22 24 26 28 30
   E4: 17 19 21 23 25 27 29 31

   In order to proceed and create the correct sequence for the next stage (or
   for the correct output, if the second stage is the last one, as in our
   example), we first put the output of extract_even operation and then the
   output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
   The input for the second stage is:

   1st vec (E1):  0  2  4  6  8 10 12 14
   2nd vec (E3): 16 18 20 22 24 26 28 30
   3rd vec (E2):  1  3  5  7  9 11 13 15
   4th vec (E4): 17 19 21 23 25 27 29 31

   The output of the second stage:

   E1: 0 4  8 12 16 20 24 28
   E2: 2 6 10 14 18 22 26 30
   E3: 1 5  9 13 17 21 25 29
   E4: 3 7 11 15 19 23 27 31

   And RESULT_CHAIN after reordering:

   1st vec (E1):  0 4  8 12 16 20 24 28
   2nd vec (E3):  1 5  9 13 17 21 25 29
   3rd vec (E2):  2 6 10 14 18 22 26 30
   4th vec (E4):  3 7 11 15 19 23 27 31.  */

static void
vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
                         unsigned int length,
                         stmt_vec_info stmt_info,
                         gimple_stmt_iterator *gsi,
                         vec<tree> *result_chain)
{
  tree data_ref, first_vect, second_vect;
  tree perm_mask_even, perm_mask_odd;
  tree perm3_mask_low, perm3_mask_high;
  gimple *perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  unsigned int i, j, log_length = exact_log2 (length);

  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
          length * sizeof (tree));

  if (length == 3)
    {
      /* vect_grouped_load_supported ensures that this is constant.  */
      unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
      unsigned int k;

      vec_perm_builder sel (nelt, nelt, 1);
      sel.quick_grow (nelt);
      vec_perm_indices indices;
      for (k = 0; k < 3; k++)
        {
          for (i = 0; i < nelt; i++)
            if (3 * i + k < 2 * nelt)
              sel[i] = 3 * i + k;
            else
              sel[i] = 0;
          indices.new_vector (sel, 2, nelt);
          perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);

          for (i = 0, j = 0; i < nelt; i++)
            if (3 * i + k < 2 * nelt)
              sel[i] = i;
            else
              sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
          indices.new_vector (sel, 2, nelt);
          perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);

          first_vect = dr_chain[0];
          second_vect = dr_chain[1];

          /* Create interleaving stmt (low part of):
             low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
                                                             ...}>  */
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
                                           second_vect, perm3_mask_low);
          vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);

          /* Create interleaving stmt (high part of):
             high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
                                                              ...}>  */
          first_vect = data_ref;
          second_vect = dr_chain[2];
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
                                           second_vect, perm3_mask_high);
          vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
          (*result_chain)[k] = data_ref;
        }
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (pow2p_hwi (length));

      /* The encoding has a single stepped pattern.  */
      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
      vec_perm_builder sel (nelt, 1, 3);
      sel.quick_grow (3);
      for (i = 0; i < 3; ++i)
        sel[i] = i * 2;
      vec_perm_indices indices (sel, 2, nelt);
      perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < 3; ++i)
        sel[i] = i * 2 + 1;
      indices.new_vector (sel, 2, nelt);
      perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < log_length; i++)
        {
          for (j = 0; j < length; j += 2)
            {
              first_vect = dr_chain[j];
              second_vect = dr_chain[j+1];

              /* data_ref = permute_even (first_data_ref, second_data_ref);  */
              data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               first_vect, second_vect,
                                               perm_mask_even);
              vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
              (*result_chain)[j/2] = data_ref;

              /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
              data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               first_vect, second_vect,
                                               perm_mask_odd);
              vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
              (*result_chain)[j/2+length/2] = data_ref;
            }
          memcpy (dr_chain.address (), result_chain->address (),
                  length * sizeof (tree));
        }
    }
}
/* Function vect_shift_permute_load_chain.

   Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
   sequence of stmts to reorder the input data accordingly.
   Return the final references for loads in RESULT_CHAIN.
   Return true if successful, false otherwise.

   E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
   The input is 3 vectors each containing 8 elements.  We assign a
   number to each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23

   The output sequence should be:

   1st vec:  0 3 6  9 12 15 18 21
   2nd vec:  1 4 7 10 13 16 19 22
   3rd vec:  2 5 8 11 14 17 20 23

   We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.

   First we shuffle all 3 vectors to get correct elements order:

   1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
   2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
   3rd vec:  (16 19 22) (17 20 23) (18 21)

   Next we unite and shift vector 3 times:

   shift right by 6 the concatenation of:
   "1st vec" and  "2nd vec"
     ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
   "2nd vec" and  "3rd vec"
     ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
   "3rd vec" and  "1st vec"
     (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)

   So that now new vectors are:

   1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
   2nd vec:  (10 13) (16 19 22) (17 20 23)
   3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)

   shift right by 5 the concatenation of:
   "1st vec" and  "3rd vec"
     ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
   "2nd vec" and  "1st vec"
     (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
   "3rd vec" and  "2nd vec"
     (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)

   So that now new vectors are:

   1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
   2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
   3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY

   shift right by 5 the concatenation of:
   "1st vec" and  "1st vec"
     ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
   shift right by 3 the concatenation of:
   "2nd vec" and  "2nd vec"
     (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)

   So that now all vectors are READY:

   1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
   2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
   3rd vec:  ( 1  4  7) (10 13) (16 19 22)

   This algorithm is faster than the one in vect_permute_load_chain if:

   1.  "shift of a concatenation" is faster than general permutation.
       This is usually so.
   2.  The TARGET machine can't execute vector instructions in parallel.
       This is because each step of the algorithm depends on the previous
       one.  The algorithm in vect_permute_load_chain is much more parallel.

   The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.  */

static bool
vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
                               unsigned int length,
                               stmt_vec_info stmt_info,
                               gimple_stmt_iterator *gsi,
                               vec<tree> *result_chain)
{
  tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
  tree perm2_mask1, perm2_mask2, perm3_mask;
  tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
  gimple *perm_stmt;

  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  unsigned int i;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);

  unsigned HOST_WIDE_INT nelt, vf;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
      || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
    /* Not supported for variable-length vectors.  */
    return false;

  vec_perm_builder sel (nelt, nelt, 1);
  sel.quick_grow (nelt);

  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
          length * sizeof (tree));

  if (pow2p_hwi (length) && vf > 4)
    {
      unsigned int j, log_length = exact_log2 (length);
      for (i = 0; i < nelt / 2; ++i)
        sel[i] = i * 2;
      for (i = 0; i < nelt / 2; ++i)
        sel[nelt / 2 + i] = i * 2 + 1;
      vec_perm_indices indices (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shuffle of 2 fields structure is not \
                              supported by target\n");
          return false;
        }
      perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < nelt / 2; ++i)
        sel[i] = i * 2 + 1;
      for (i = 0; i < nelt / 2; ++i)
        sel[nelt / 2 + i] = i * 2;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shuffle of 2 fields structure is not \
                              supported by target\n");
          return false;
        }
      perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = nelt / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift1_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to select vector from 2.
         For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
      for (i = 0; i < nelt / 2; i++)
        sel[i] = i;
      for (i = nelt / 2; i < nelt; i++)
        sel[i] = nelt + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "select is not supported by target\n");
          return false;
        }
      select_mask = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < log_length; i++)
        {
          for (j = 0; j < length; j += 2)
            {
              first_vect = dr_chain[j];
              second_vect = dr_chain[j + 1];

              data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               first_vect, first_vect,
                                               perm2_mask1);
              vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
              vect[0] = data_ref;

              data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               second_vect, second_vect,
                                               perm2_mask2);
              vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
              vect[1] = data_ref;

              data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               vect[0], vect[1], shift1_mask);
              vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
              (*result_chain)[j/2 + length/2] = data_ref;

              data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               vect[0], vect[1], select_mask);
              vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
              (*result_chain)[j/2] = data_ref;
            }
          memcpy (dr_chain.address (), result_chain->address (),
                  length * sizeof (tree));
        }
      return true;
    }
  if (length == 3 && vf > 2)
    {
      unsigned int k = 0, l = 0;

      /* Generating permutation constant to get all elements in right order.
         For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
      for (i = 0; i < nelt; i++)
        {
          if (3 * k + (l % 3) >= nelt)
            {
              k = 0;
              l += (3 - (nelt % 3));
            }
          sel[i] = 3 * k + (l % 3);
          k++;
        }
      vec_perm_indices indices (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shuffle of 3 fields structure is not \
                              supported by target\n");
          return false;
        }
      perm3_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift1_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = 2 * (nelt / 3) + 1 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift2_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift3_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift4_mask = vect_gen_perm_mask_checked (vectype, indices);

      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           dr_chain[k], dr_chain[k],
                                           perm3_mask);
          vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
          vect[k] = data_ref;
        }

      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           vect[k % 3], vect[(k + 1) % 3],
                                           shift1_mask);
          vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
          vect_shift[k] = data_ref;
        }

      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           vect_shift[(4 - k) % 3],
                                           vect_shift[(3 - k) % 3],
                                           shift2_mask);
          vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
          vect[k] = data_ref;
        }

      (*result_chain)[3 - (nelt % 3)] = vect[2];

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
                                       vect[0], shift3_mask);
      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
      (*result_chain)[nelt % 3] = data_ref;

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
                                       vect[1], shift4_mask);
      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
      (*result_chain)[0] = data_ref;
      return true;
    }
  return false;
}
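
/* Illustrative sketch, hypothetical and for exposition only: the perm3
   selector built at the top of the length == 3 path above, computed into a
   plain array.  For nelt == 8 it reproduces the {0 3 6 1 4 7 2 5} constant
   quoted in the comment.  */

static void
example_build_perm3_sel (unsigned int nelt, unsigned int *sel)
{
  unsigned int k = 0, l = 0;
  for (unsigned int i = 0; i < nelt; i++)
    {
      /* Restart from the beginning of the vector whenever the next
         "every third element" index would fall outside it.  */
      if (3 * k + (l % 3) >= nelt)
        {
          k = 0;
          l += (3 - (nelt % 3));
        }
      sel[i] = 3 * k + (l % 3);
      k++;
    }
}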
/* Function vect_transform_grouped_load.

   Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
   to perform their permutation and ascribe the result vectorized statements to
   the scalar statements.  */

void
vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
                             vec<tree> dr_chain,
                             int size, gimple_stmt_iterator *gsi)
{
  machine_mode mode;
  vec<tree> result_chain = vNULL;

  /* DR_CHAIN contains input data-refs that are a part of the interleaving.
     RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
     vectors, that are ready for vector computation.  */
  result_chain.create (size);

  /* If the reassociation width for the vector type is 2 or greater, the
     target machine can execute 2 or more vector instructions in parallel.
     Otherwise try to get a chain for the load group using
     vect_shift_permute_load_chain.  */
  mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
  if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
      || pow2p_hwi (size)
      || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
                                         gsi, &result_chain))
    vect_permute_load_chain (vinfo, dr_chain,
                             size, stmt_info, gsi, &result_chain);
  vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
  result_chain.release ();
}
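
/* Illustrative sketch, hypothetical: the dispatch rule used above, with the
   reassociation-width target hook and the result of trying the shift-based
   chain passed in as plain values.  Returns true when the extract even/odd
   chain of vect_permute_load_chain should be used.  */

static bool
example_prefer_permute_load_chain (int reassoc_width, unsigned int size,
                                   bool shift_chain_generated)
{
  /* A wide reassociation window means the independent permutes can issue
     in parallel, so the more parallel even/odd chain wins; it is also the
     fallback whenever the shift-based chain was not generated.  */
  bool pow2_size = (size & (size - 1)) == 0;
  return reassoc_width > 1 || pow2_size || !shift_chain_generated;
}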
/* RESULT_CHAIN contains the output of a group of grouped loads that were
   generated as part of the vectorization of STMT_INFO.  Assign the statement
   for each vector to the associated scalar statement.  */

void
vect_record_grouped_load_vectors (vec_info *vinfo, stmt_vec_info stmt_info,
                                  vec<tree> result_chain)
{
  stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
  unsigned int i, gap_count;
  tree tmp_data_ref;

  /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
  stmt_vec_info next_stmt_info = first_stmt_info;
  gap_count = 1;
  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
    {
      if (!next_stmt_info)
        break;

      /* Skip the gaps.  Loads created for the gaps will be removed by dead
         code elimination pass later.  No need to check for the first stmt in
         the group, since it always exists.
         DR_GROUP_GAP is the number of steps in elements from the previous
         access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
         correspond to the gaps.  */
      if (next_stmt_info != first_stmt_info
          && gap_count < DR_GROUP_GAP (next_stmt_info))
        {
          gap_count++;
          continue;
        }

      /* ???  The following needs cleanup after the removal of
         DR_GROUP_SAME_DR_STMT.  */
      if (next_stmt_info)
        {
          stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
          /* We assume that if VEC_STMT is not NULL, this is a case of multiple
             copies, and we put the new vector statement in the first available
             RELATED_STMT field.  */
          if (!STMT_VINFO_VEC_STMT (next_stmt_info))
            STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
          else
            {
              stmt_vec_info prev_stmt_info
                = STMT_VINFO_VEC_STMT (next_stmt_info);
              stmt_vec_info rel_stmt_info
                = STMT_VINFO_RELATED_STMT (prev_stmt_info);
              while (rel_stmt_info)
                {
                  prev_stmt_info = rel_stmt_info;
                  rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
                }

              STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
            }

          next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
          gap_count = 1;
        }
    }
}
/* Function vect_can_force_dr_alignment_p.

   Returns whether the alignment of a DECL can be forced to be aligned
   on ALIGNMENT bit boundary.  */

bool
vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
{
  if (!VAR_P (decl))
    return false;

  if (decl_in_symtab_p (decl)
      && !symtab_node::get (decl)->can_increase_alignment_p ())
    return false;

  if (TREE_STATIC (decl))
    return (known_le (alignment,
                      (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
  else
    return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
}
/* Return whether the data reference DR_INFO is supported with respect to its
   alignment.
   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
   if it is aligned, i.e., check if it is possible to vectorize it with
   different alignment.  */

enum dr_alignment_support
vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
                               bool check_aligned_accesses)
{
  data_reference *dr = dr_info->dr;
  stmt_vec_info stmt_info = dr_info->stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (aligned_access_p (dr_info) && !check_aligned_accesses)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    if (gimple_call_internal_p (stmt)
        && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
            || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
      return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++){
        for (j=0; j<M; j++){
          r = a[i+j];
          ... = r;
        }
      }

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4){
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }
      }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
          for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                         // that the misalignment of the initial address is
                         // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
      } */

  if (DR_IS_READ (dr))
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
          && (!targetm.vectorize.builtin_mask_for_load
              || targetm.vectorize.builtin_mask_for_load ()))
        {
          tree vectype = STMT_VINFO_VECTYPE (stmt_info);

          /* If we are doing SLP then the accesses need not have the
             same alignment, instead it depends on the SLP group size.  */
          if (loop_vinfo
              && STMT_SLP_TYPE (stmt_info)
              && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                              * (DR_GROUP_SIZE
                                 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
                              TYPE_VECTOR_SUBPARTS (vectype)))
            ;
          else if (!loop_vinfo
                   || (nested_in_vect_loop
                       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
                                    GET_MODE_SIZE (TYPE_MODE (vectype)))))
            return dr_explicit_realign;
          else
            return dr_explicit_realign_optimized;
        }
      if (!known_alignment_for_access_p (dr_info))
        is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
            (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
        /* Can't software pipeline the loads, but can at least do them.  */
        return dr_unaligned_supported;
    }
  else
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (!known_alignment_for_access_p (dr_info))
        is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
            (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
        return dr_unaligned_supported;
    }

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}
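
/* Illustrative sketch, hypothetical and for exposition only: the
   misalignment behaviour described in the comment above.  For an access
   vp[i + j] with element size ESIZE and vector size VS bytes, the value
   below is the misalignment in elements; it stays fixed across inner-loop
   iterations only when the inner-loop step (in bytes) is a multiple of VS.  */

static unsigned int
example_inner_loop_misalignment (unsigned long long base_addr,
                                 unsigned int esize, unsigned int vs,
                                 unsigned int i, unsigned int j)
{
  /* Address of vp[i + j], then its offset within the enclosing VS-sized
     chunk, expressed in elements.  */
  unsigned long long addr = base_addr + (unsigned long long) (i + j) * esize;
  return (unsigned int) ((addr % vs) / esize);
}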