gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2020 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "tree-cfg.h"
  53 #include "tree-hash-traits.h"
  54 #include "vec-perm-indices.h"
  55 #include "internal-fn.h"
  56
  57 /* Return true if load- or store-lanes optab OPTAB is implemented for
  58    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  59
  60 static bool
  61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  62                               tree vectype, unsigned HOST_WIDE_INT count)
  63 {
  64   machine_mode mode, array_mode;
  65   bool limit_p;
  66
  67   mode = TYPE_MODE (vectype);
  68   if (!targetm.array_mode (mode, count).exists (&array_mode))
  69     {
  70       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
  71       limit_p = !targetm.array_mode_supported_p (mode, count);
  72       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
  73         {
  74           if (dump_enabled_p ())
  75             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  76                              "no array mode for %s[%wu]\n",
  77                              GET_MODE_NAME (mode), count);
  78           return false;
  79         }
  80     }
  81
  82   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  83     {
  84       if (dump_enabled_p ())
  85         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  86                          "cannot use %s<%s><%s>\n", name,
  87                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  88       return false;
  89     }
  90
  91   if (dump_enabled_p ())
  92     dump_printf_loc (MSG_NOTE, vect_location,
  93                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  94                      GET_MODE_NAME (mode));
  95
  96   return true;
  97 }
  98
  99
 100 /* Return the smallest scalar part of STMT_INFO.
 101    This is used to determine the vectype of the stmt.  We generally set the
 102    vectype according to the type of the result (lhs).  For stmts whose
 103    result-type is different than the type of the arguments (e.g., demotion,
 104    promotion), vectype will be reset appropriately (later).  Note that we have
 105    to visit the smallest datatype in this function, because that determines the
 106    VF.  If the smallest datatype in the loop is present only as the rhs of a
 107    promotion operation - we'd miss it.
 108    Such a case, where a variable of this datatype does not appear in the lhs
 109    anywhere in the loop, can only occur if it's an invariant: e.g.:
 110    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 111    invariant motion.  However, we cannot rely on invariant motion to always
 112    take invariants out of the loop, and so in the case of promotion we also
 113    have to check the rhs.
 114    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 115    types.  */
 116
 117 tree
 118 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
 119                                HOST_WIDE_INT *lhs_size_unit,
 120                                HOST_WIDE_INT *rhs_size_unit)
 121 {
 122   tree scalar_type = gimple_expr_type (stmt_info->stmt);
 123   HOST_WIDE_INT lhs, rhs;
 124
 125   /* During the analysis phase, this function is called on arbitrary
 126      statements that might not have scalar results.  */
 127   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 128     return scalar_type;
 129
 130   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 131
 132   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
 133   if (assign
 134       && (gimple_assign_cast_p (assign)
 135           || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
 136           || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
 137           || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
 138           || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
 139           || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
 140     {
 141       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
 142
 143       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 144       if (rhs < lhs)
 145         scalar_type = rhs_type;
 146     }
 147   else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
 148     {
 149       unsigned int i = 0;
 150       if (gimple_call_internal_p (call))
 151         {
 152           internal_fn ifn = gimple_call_internal_fn (call);
 153           if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
 154             /* gimple_expr_type already picked the type of the loaded
 155                or stored data.  */
 156             i = ~0U;
 157           else if (internal_fn_mask_index (ifn) == 0)
 158             i = 1;
 159         }
 160       if (i < gimple_call_num_args (call))
 161         {
 162           tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
 163           if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
 164             {
 165               rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 166               if (rhs < lhs)
 167                 scalar_type = rhs_type;
 168             }
 169         }
 170     }
 171
 172   *lhs_size_unit = lhs;
 173   *rhs_size_unit = rhs;
 174   return scalar_type;
 175 }
 176
 177
 178 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 179    tested at run-time.  Return TRUE if DDR was successfully inserted.
 180    Return false if versioning is not supported.  */
 181
 182 static opt_result
 183 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 184 {
 185   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 186
 187   if ((unsigned) param_vect_max_version_for_alias_checks == 0)
 188     return opt_result::failure_at (vect_location,
 189                                    "will not create alias checks, as"
 190                                    " --param vect-max-version-for-alias-checks"
 191                                    " == 0\n");
 192
 193   opt_result res
 194     = runtime_alias_check_p (ddr, loop,
 195                              optimize_loop_nest_for_speed_p (loop));
 196   if (!res)
 197     return res;
 198
 199   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 200   return opt_result::success ();
 201 }
 202
 203 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
 204
 205 static void
 206 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
 207 {
 208   vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 209   for (unsigned int i = 0; i < checks.length(); ++i)
 210     if (checks[i] == value)
 211       return;
 212
 213   if (dump_enabled_p ())
 214     dump_printf_loc (MSG_NOTE, vect_location,
 215                      "need run-time check that %T is nonzero\n",
 216                      value);
 217   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
 218 }
 219
 220 /* Return true if we know that the order of vectorized DR_INFO_A and
 221    vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
 222    DR_INFO_B.  At least one of the accesses is a write.  */
 223
 224 static bool
 225 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
 226 {
 227   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
 228   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
 229
 230   /* Single statements are always kept in their original order.  */
 231   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 232       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 233     return true;
 234
 235   /* STMT_A and STMT_B belong to overlapping groups.  All loads are
 236      emitted at the position of the first scalar load.
 237      Stores in a group are emitted at the position of the last scalar store.
 238      Compute that position and check whether the resulting order matches
 239      the current one.  */
 240   stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
 241   if (il_a)
 242     {
 243       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
 244         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
 245              s = DR_GROUP_NEXT_ELEMENT (s))
 246           il_a = get_later_stmt (il_a, s);
 247       else /* DR_IS_READ */
 248         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
 249              s = DR_GROUP_NEXT_ELEMENT (s))
 250           if (get_later_stmt (il_a, s) == il_a)
 251             il_a = s;
 252     }
 253   else
 254     il_a = stmtinfo_a;
 255   stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
 256   if (il_b)
 257     {
 258       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
 259         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
 260              s = DR_GROUP_NEXT_ELEMENT (s))
 261           il_b = get_later_stmt (il_b, s);
 262       else /* DR_IS_READ */
 263         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
 264              s = DR_GROUP_NEXT_ELEMENT (s))
 265           if (get_later_stmt (il_b, s) == il_b)
 266             il_b = s;
 267     }
 268   else
 269     il_b = stmtinfo_b;
 270   bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
 271   return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
 272 }
 273
 274 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 275    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 276    distances.  These distances are conservatively correct but they don't
 277    reflect a guaranteed dependence.
 278
 279    Return true if this function does all the work necessary to avoid
 280    an alias or false if the caller should use the dependence distances
 281    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 282    the depth of the loop described by LOOP_VINFO and the other arguments
 283    are as for vect_analyze_data_ref_dependence.  */
 284
 285 static bool
 286 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 287                                        loop_vec_info loop_vinfo,
 288                                        int loop_depth, unsigned int *max_vf)
 289 {
 290   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 291   lambda_vector dist_v;
 292   unsigned int i;
 293   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 294     {
 295       int dist = dist_v[loop_depth];
 296       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 297         {
 298           /* If the user asserted safelen >= DIST consecutive iterations
 299              can be executed concurrently, assume independence.
 300
 301              ??? An alternative would be to add the alias check even
 302              in this case, and vectorize the fallback loop with the
 303              maximum VF set to safelen.  However, if the user has
 304              explicitly given a length, it's less likely that that
 305              would be a win.  */
 306           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 307             {
 308               if ((unsigned int) loop->safelen < *max_vf)
 309                 *max_vf = loop->safelen;
 310               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 311               continue;
 312             }
 313
 314           /* For dependence distances of 2 or more, we have the option
 315              of limiting VF or checking for an alias at runtime.
 316              Prefer to check at runtime if we can, to avoid limiting
 317              the VF unnecessarily when the bases are in fact independent.
 318
 319              Note that the alias checks will be removed if the VF ends up
 320              being small enough.  */
 321           dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
 322           dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
 323           return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
 324                   && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
 325                   && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
 326         }
 327     }
 328   return true;
 329 }
 330
 331
/* Function vect_analyze_data_ref_dependence.

   FIXME: I needed to change the sense of the returned flag.

   Analyze the dependence relation DDR between its two data references
   (DDR_A and DDR_B) for the loop described by LOOP_VINFO.

   Return a failure result if there (might) exist a dependence between
   memory-reference DRA and memory-reference DRB that prevents
   vectorization.  Return success if the references are independent, if
   the dependence does not matter for vectorization, or if versioning
   for alias will check the dependence at run-time.  Adjust *MAX_VF
   according to the data dependence (and to the loop's safelen where that
   is used to assume independence).  */

static opt_result
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
                                  loop_vec_info loop_vinfo,
                                  unsigned int *max_vf)
{
  unsigned int i;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
  dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return opt_result::success ();

  /* A reference never conflicts with itself, and two reads never
     conflict with each other.  */
  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return opt_result::success ();

  /* We do not have to consider dependences between accesses that belong
     to the same group, unless the stride could be smaller than the
     group size.  */
  if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
      && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
          == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
      && !STMT_VINFO_STRIDED_P (stmtinfo_a))
    return opt_result::success ();

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
                                 get_alias_set (DR_REF (drb))))
    return opt_result::success ();

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  The VF is capped
         at safelen and the loop is no longer known to be free of data
         dependencies.  */
      if (loop->safelen >= 2)
        {
          if ((unsigned int) loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return opt_result::success ();
        }

      /* Run-time alias versioning is not attempted for gather/scatter
         accesses, so give up for those.  */
      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        return opt_result::failure_at
          (stmtinfo_a->stmt,
           "versioning for alias not supported for: "
           "can't determine dependence between %T and %T\n",
           DR_REF (dra), DR_REF (drb));

      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
                         "versioning for alias required: "
                         "can't determine dependence between %T and %T\n",
                         DR_REF (dra), DR_REF (drb));

      /* Add to list of ddrs that need to be tested at run-time.  */
      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  Without any distance vector it is handled
     exactly like the unknown case above, only the messages differ.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if ((unsigned int) loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return opt_result::success ();
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        return opt_result::failure_at
          (stmtinfo_a->stmt,
           "versioning for alias not supported for: "
           "bad dist vector for %T and %T\n",
           DR_REF (dra), DR_REF (drb));

      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
                         "versioning for alias required: "
                         "bad dist vector for %T and %T\n",
                         DR_REF (dra), DR_REF (drb));
      /* Add to list of ddrs that need to be tested at run-time.  */
      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Position of LOOP within the loop nest of DDR; used to select this
     loop's entry from each distance vector.  */
  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));

  /* If the distances are only conservative, let the helper try to deal
     with the DDR completely (via safelen or a run-time alias check).
     When it returns false we fall through and treat the distances as
     real dependences.  */
  if (DDR_COULD_BE_INDEPENDENT_P (ddr)
      && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
                                                loop_depth, max_vf))
    return opt_result::success ();

  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance  = %d.\n", dist);

      if (dist == 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance == 0 between %T and %T\n",
                             DR_REF (dra), DR_REF (drb));

          /* When we perform grouped accesses and perform implicit CSE
             by detecting equal accesses and doing disambiguation with
             runtime alias tests like for
                .. = a[i];
                .. = a[i+1];
                a[i] = ..;
                a[i+1] = ..;
                *p = ..;
                .. = a[i];
                .. = a[i+1];
             where we will end up loading { a[i], a[i+1] } once, make
             sure that inserting group loads before the first load and
             stores after the last store will do the right thing.
             Similar for groups like
                a[i] = ...;
                ... = a[i];
                a[i+1] = ...;
             where loads from the group interleave with the store.  */
          if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
            return opt_result::failure_at (stmtinfo_a->stmt,
                                           "READ_WRITE dependence"
                                           " in interleaving.\n");

          /* Unless the user gave a safelen, the access must not have a
             zero step.  Fail if no indicator is available or it is known
             to be zero; if it is not a compile-time constant, request a
             run-time check that it is nonzero.  */
          if (loop->safelen < 2)
            {
              tree indicator = dr_zero_step_indicator (dra);
              if (!indicator || integer_zerop (indicator))
                return opt_result::failure_at (stmtinfo_a->stmt,
                                               "access also has a zero step\n");
              else if (TREE_CODE (indicator) != INTEGER_CST)
                vect_check_nonzero_value (loop_vinfo, indicator);
            }
          continue;
        }

      if (dist > 0 && DDR_REVERSED_P (ddr))
        {
          /* If DDR_REVERSED_P the order of the data-refs in DDR was
             reversed (to make distance vector positive), and the actual
             distance is negative.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance negative.\n");
          /* When doing outer loop vectorization, we need to check if there is
             a backward dependence at the inner loop level if the dependence
             at the outer loop is reversed.  See PR81740.  */
          if (nested_in_vect_loop_p (loop, stmtinfo_a)
              || nested_in_vect_loop_p (loop, stmtinfo_b))
            {
              unsigned inner_depth = index_in_loop_nest (loop->inner->num,
                                                         DDR_LOOP_NEST (ddr));
              if (dist_v[inner_depth] < 0)
                return opt_result::failure_at (stmtinfo_a->stmt,
                                               "not vectorized, dependence "
                                               "between data-refs %T and %T\n",
                                               DR_REF (dra), DR_REF (drb));
            }
          /* Record a negative dependence distance to later limit the
             amount of stmt copying / unrolling we can perform.
             Only need to handle read-after-write dependence.
             The smallest nonzero distance seen so far is kept; zero
             means none has been recorded yet.  */
          if (DR_IS_READ (drb)
              && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
                  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
            STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
          continue;
        }

      /* Only distances of at least 2 can be accommodated by lowering
         the VF; a smaller distance ends in the failure below unless
         *MAX_VF is already no larger than it.  */
      unsigned int abs_dist = abs (dist);
      if (abs_dist >= 2 && abs_dist < *max_vf)
        {
          /* The dependence distance requires reduction of the maximal
             vectorization factor.  */
          *max_vf = abs_dist;
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "adjusting maximal vectorization factor to %i\n",
                             *max_vf);
        }

      if (abs_dist >= *max_vf)
        {
          /* Dependence distance does not create dependence, as far as
             vectorization is concerned, in this case.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance >= VF.\n");
          continue;
        }

      return opt_result::failure_at (stmtinfo_a->stmt,
                                     "not vectorized, possible dependence "
                                     "between data-refs %T and %T\n",
                                     DR_REF (dra), DR_REF (drb));
    }

  return opt_result::success ();
}
 571
 572 /* Function vect_analyze_data_ref_dependences.
 573
 574    Examine all the data references in the loop, and make sure there do not
 575    exist any data dependences between them.  Set *MAX_VF according to
 576    the maximum vectorization factor the data dependences allow.  */
 577
 578 opt_result
 579 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 580                                    unsigned int *max_vf)
 581 {
 582   unsigned int i;
 583   struct data_dependence_relation *ddr;
 584
 585   DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
 586
 587   if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
 588     {
 589       LOOP_VINFO_DDRS (loop_vinfo)
 590         .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 591                  * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 592       /* We do not need read-read dependences.  */
 593       bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 594                                           &LOOP_VINFO_DDRS (loop_vinfo),
 595                                           LOOP_VINFO_LOOP_NEST (loop_vinfo),
 596                                           false);
 597       gcc_assert (res);
 598     }
 599
 600   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 601
 602   /* For epilogues we either have no aliases or alias versioning
 603      was applied to original loop.  Therefore we may just get max_vf
 604      using VF of original loop.  */
 605   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 606     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 607   else
 608     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 609       {
 610         opt_result res
 611           = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
 612         if (!res)
 613           return res;
 614       }
 615
 616   return opt_result::success ();
 617 }
 618
 619
 620 /* Function vect_slp_analyze_data_ref_dependence.
 621
 622    Return TRUE if there (might) exist a dependence between a memory-reference
 623    DRA and a memory-reference DRB for VINFO.  When versioning for alias
 624    may check a dependence at run-time, return FALSE.  Adjust *MAX_VF
 625    according to the data dependence.  */
 626
 627 static bool
 628 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
 629                                       struct data_dependence_relation *ddr)
 630 {
 631   struct data_reference *dra = DDR_A (ddr);
 632   struct data_reference *drb = DDR_B (ddr);
 633   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
 634   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
 635
 636   /* We need to check dependences of statements marked as unvectorizable
 637      as well, they still can prohibit vectorization.  */
 638
 639   /* Independent data accesses.  */
 640   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 641     return false;
 642
 643   if (dra == drb)
 644     return false;
 645
 646   /* Read-read is OK.  */
 647   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 648     return false;
 649
 650   /* If dra and drb are part of the same interleaving chain consider
 651      them independent.  */
 652   if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
 653       && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
 654           == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
 655     return false;
 656
 657   /* Unknown data dependence.  */
 658   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 659     {
 660       if  (dump_enabled_p ())
 661         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 662                          "can't determine dependence between %T and %T\n",
 663                          DR_REF (dra), DR_REF (drb));
 664     }
 665   else if (dump_enabled_p ())
 666     dump_printf_loc (MSG_NOTE, vect_location,
 667                      "determined dependence between %T and %T\n",
 668                      DR_REF (dra), DR_REF (drb));
 669
 670   return true;
 671 }
 672
 673
 674 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 675    contain the vector of scalar stores of this instance if we are
 676    disambiguating the loads.  */
 677
 678 static bool
 679 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
 680                                    vec<stmt_vec_info> stores,
 681                                    stmt_vec_info last_store_info)
 682 {
 683   /* This walks over all stmts involved in the SLP load/store done
 684      in NODE verifying we can sink them up to the last stmt in the
 685      group.  */
 686   if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
 687     {
 688       stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
 689       for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
 690         {
 691           stmt_vec_info access_info
 692             = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
 693           if (access_info == last_access_info)
 694             continue;
 695           data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
 696           ao_ref ref;
 697           bool ref_initialized_p = false;
 698           for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
 699                gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
 700             {
 701               gimple *stmt = gsi_stmt (gsi);
 702               if (! gimple_vuse (stmt))
 703                 continue;
 704
 705               /* If we couldn't record a (single) data reference for this
 706                  stmt we have to resort to the alias oracle.  */
 707               stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
 708               data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
 709               if (!dr_b)
 710                 {
 711                   /* We are moving a store - this means
 712                      we cannot use TBAA for disambiguation.  */
 713                   if (!ref_initialized_p)
 714                     ao_ref_init (&ref, DR_REF (dr_a));
 715                   if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
 716                       || ref_maybe_used_by_stmt_p (stmt, &ref, false))
 717                     return false;
 718                   continue;
 719                 }
 720
 721               bool dependent = false;
 722               /* If we run into a store of this same instance (we've just
 723                  marked those) then delay dependence checking until we run
 724                  into the last store because this is where it will have
 725                  been sunk to (and we verify if we can do that as well).  */
 726               if (gimple_visited_p (stmt))
 727                 {
 728                   if (stmt_info != last_store_info)
 729                     continue;
 730                   unsigned i;
 731                   stmt_vec_info store_info;
 732                   FOR_EACH_VEC_ELT (stores, i, store_info)
 733                     {
 734                       data_reference *store_dr
 735                         = STMT_VINFO_DATA_REF (store_info);
 736                       ddr_p ddr = initialize_data_dependence_relation
 737                                     (dr_a, store_dr, vNULL);
 738                       dependent
 739                         = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 740                       free_dependence_relation (ddr);
 741                       if (dependent)
 742                         break;
 743                     }
 744                 }
 745               else
 746                 {
 747                   ddr_p ddr = initialize_data_dependence_relation (dr_a,
 748                                                                    dr_b, vNULL);
 749                   dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 750                   free_dependence_relation (ddr);
 751                 }
 752               if (dependent)
 753                 return false;
 754             }
 755         }
 756     }
 757   else /* DR_IS_READ */
 758     {
 759       stmt_vec_info first_access_info
 760         = vect_find_first_scalar_stmt_in_slp (node);
 761       for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
 762         {
 763           stmt_vec_info access_info
 764             = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
 765           if (access_info == first_access_info)
 766             continue;
 767           data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
 768           ao_ref ref;
 769           bool ref_initialized_p = false;
 770           for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
 771                gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
 772             {
 773               gimple *stmt = gsi_stmt (gsi);
 774               if (! gimple_vdef (stmt))
 775                 continue;
 776
 777               /* If we couldn't record a (single) data reference for this
 778                  stmt we have to resort to the alias oracle.  */
 779               stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
 780               data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
 781               if (!dr_b)
 782                 {
 783                   /* We are hoisting a load - this means we can use
 784                      TBAA for disambiguation.  */
 785                   if (!ref_initialized_p)
 786                     ao_ref_init (&ref, DR_REF (dr_a));
 787                   if (stmt_may_clobber_ref_p_1 (stmt, &ref, true))
 788                     return false;
 789                   continue;
 790                 }
 791
 792               bool dependent = false;
 793               /* If we run into a store of this same instance (we've just
 794                  marked those) then delay dependence checking until we run
 795                  into the last store because this is where it will have
 796                  been sunk to (and we verify if we can do that as well).  */
 797               if (gimple_visited_p (stmt))
 798                 {
 799                   if (stmt_info != last_store_info)
 800                     continue;
 801                   unsigned i;
 802                   stmt_vec_info store_info;
 803                   FOR_EACH_VEC_ELT (stores, i, store_info)
 804                     {
 805                       data_reference *store_dr
 806                         = STMT_VINFO_DATA_REF (store_info);
 807                       ddr_p ddr = initialize_data_dependence_relation
 808                                     (dr_a, store_dr, vNULL);
 809                       dependent
 810                         = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 811                       free_dependence_relation (ddr);
 812                       if (dependent)
 813                         break;
 814                     }
 815                 }
 816               else
 817                 {
 818                   ddr_p ddr = initialize_data_dependence_relation (dr_a,
 819                                                                    dr_b, vNULL);
 820                   dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 821                   free_dependence_relation (ddr);
 822                 }
 823               if (dependent)
 824                 return false;
 825             }
 826         }
 827     }
 828   return true;
 829 }
 830
 831
 832 /* Function vect_slp_analyze_instance_dependence.
 833
 834    Check whether the scalar stores and loads of SLP instance INSTANCE can be
 835    moved to the places where their vectorized stmts get inserted.  Return
 836    false if a data dependence prevents that, true otherwise.  */
 837
 838 bool
 839 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
 840 {
 841   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
 842
 843   /* The root of the SLP tree holds the stores of this instance; treat the
 844      instance as store-less when the root carries no data reference.  */
 845   slp_tree root = SLP_INSTANCE_TREE (instance);
 846   slp_tree store_node
 847     = STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)) ? root : NULL;
 848
 849   vec<stmt_vec_info> store_stmts = vNULL;
 850   stmt_vec_info last_store_info = NULL;
 851   stmt_vec_info store_stmt;
 852   unsigned int k;
 853   if (store_node)
 854     {
 855       /* Verify we can sink stores to the vectorized stmt insert location.  */
 856       if (! vect_slp_analyze_node_dependences (vinfo, store_node, vNULL, NULL))
 857         return false;
 858
 859       /* Remember the last store and flag all stores of this instance.  */
 860       last_store_info = vect_find_last_scalar_stmt_in_slp (store_node);
 861       store_stmts = SLP_TREE_SCALAR_STMTS (store_node);
 862       FOR_EACH_VEC_ELT (store_stmts, k, store_stmt)
 863         gimple_set_visited (store_stmt->stmt, true);
 864     }
 865
 866   /* Verify we can sink loads to the vectorized stmt insert location,
 867      special-casing stores of this instance.  Stop at the first failure.  */
 868   bool ok = true;
 869   slp_tree load_node;
 870   unsigned int i;
 871   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
 872     {
 873       ok = vect_slp_analyze_node_dependences (vinfo, load_node, store_stmts,
 874                                               last_store_info);
 875       if (!ok)
 876         break;
 877     }
 878
 879   /* Clear the visited flags again (no-op if there were no stores).  */
 880   FOR_EACH_VEC_ELT (store_stmts, k, store_stmt)
 881     gimple_set_visited (store_stmt->stmt, false);
 882   return ok;
 883 }
 884
 885 /* Pool the base alignment guarantee described by DRB, which comes from
 886    the data reference in STMT_INFO, into VINFO's base_alignments map.  */
 887
 888 static void
 889 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
 890                             innermost_loop_behavior *drb)
 891 {
 892   bool known_base_p;
 893   innermost_loop_behavior *&slot
 894     = vinfo->base_alignments.get_or_insert (drb->base_address,
 895                                             &known_base_p);
 896   /* An existing record that is at least as strong as DRB is kept.  */
 897   if (known_base_p && slot->base_alignment >= drb->base_alignment)
 898     return;
 899   slot = drb;
 900   if (!dump_enabled_p ())
 901     return;
 902   dump_printf_loc (MSG_NOTE, vect_location,
 903                    "recording new base alignment for %T\n"
 904                    "  alignment:    %d\n"
 905                    "  misalignment: %d\n"
 906                    "  based on:     %G",
 907                    drb->base_address, drb->base_alignment,
 908                    drb->base_misalignment, stmt_info->stmt);
 909 }
 910
 911 /* Pool the base alignment guarantees of all unconditional data references
 912    in VINFO.  If the region we're going to vectorize is reached, every
 913    unconditional reference occurs at least once, so the alignment of its
 914    base address holds for all other references with the same base.  For
 915    each data reference whose containing statement makes the access
 916    unconditionally, is vectorizable and is not a gather/scatter, record
 917    the alignment of the base address in VINFO.  */
 918
 919 void
 920 vect_record_base_alignments (vec_info *vinfo)
 921 {
 922   class loop *vect_loop = NULL;
 923   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
 924     vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
 925   unsigned int ix;
 926   data_reference *ref;
 927   FOR_EACH_VEC_ELT (vinfo->shared->datarefs, ix, ref)
 928     {
 929       stmt_vec_info stmt_info = vinfo->lookup_dr (ref)->stmt;
 930       if (DR_IS_CONDITIONAL_IN_STMT (ref)
 931           || !STMT_VINFO_VECTORIZABLE (stmt_info)
 932           || STMT_VINFO_GATHER_SCATTER_P (stmt_info))
 933         continue;
 934
 935       vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (ref));
 936
 937       /* A reference nested in the loop being vectorized also provides
 938          the alignment of its base wrt the outer loop.  */
 939       if (vect_loop && nested_in_vect_loop_p (vect_loop, stmt_info))
 940         vect_record_base_alignment
 941           (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 942     }
 943 }
 944
 945 /* Return the target's preferred alignment for DR_INFO's vector type.  */
 946
 947 static poly_uint64
 948 vect_calculate_target_alignment (dr_vec_info *dr_info)
 949 {
 950   return targetm.vectorize.preferred_vector_alignment
 951            (STMT_VINFO_VECTYPE (dr_info->stmt));
 952 }
 953
 954 /* Function vect_compute_data_ref_alignment
 955
 956    Compute the misalignment of the data reference DR_INFO in VINFO.
 957
 958    Output:
 959    1. DR_MISALIGNMENT (DR_INFO) is set; it stays DR_MISALIGNMENT_UNKNOWN
 960       on every early return below.
 961    2. DR_TARGET_ALIGNMENT (DR_INFO) is set, except for gather/scatter.
 962    3. DR_INFO's base_decl/base_misaligned are set if alignment is forced.  */
 963
 964 static void
 965 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info)
 966 {
 967   stmt_vec_info stmt_info = dr_info->stmt;
 968   vec_base_alignments *base_alignments = &vinfo->base_alignments;
 969   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 970   class loop *loop = NULL;
 971   tree ref = DR_REF (dr_info->dr);
 972   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 973
 974   if (dump_enabled_p ())
 975     dump_printf_loc (MSG_NOTE, vect_location,
 976                      "vect_compute_data_ref_alignment:\n");
 977
 978   if (loop_vinfo)
 979     loop = LOOP_VINFO_LOOP (loop_vinfo);
 980
 981   /* Initialize misalignment to unknown.  */
 982   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
 983
 984   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
 985     return;
 986
 987   innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
 988   bool step_preserves_misalignment_p;
 989
 990   poly_uint64 vector_alignment
 991     = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
 992   DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
 993
 994   /* If the main loop has peeled for alignment we have no way of knowing
 995      whether the data accesses in the epilogues are aligned.  We can't at
 996      compile time answer the question whether we have entered the main loop or
 997      not.  Fixes PR 92351.  */
 998   if (loop_vinfo)
 999     {
1000       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1001       if (orig_loop_vinfo
1002           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1003         return;
1004     }
1005
1006   unsigned HOST_WIDE_INT vect_align_c;
1007   if (!vector_alignment.is_constant (&vect_align_c))
1008     return;
1009
1010   /* No step for BB vectorization.  */
1011   if (!loop)
1012     {
1013       gcc_assert (integer_zerop (drb->step));
1014       step_preserves_misalignment_p = true;
1015     }
1016
1017   /* In case the dataref is in an inner-loop of the loop that is being
1018      vectorized (LOOP), we use the base and misalignment information
1019      relative to the outer-loop (LOOP).  This is ok only if the misalignment
1020      stays the same throughout the execution of the inner-loop, which is why
1021      we have to check that the stride of the dataref in the inner-loop evenly
1022      divides by the vector alignment.  */
1023   else if (nested_in_vect_loop_p (loop, stmt_info))
1024     {
1025       step_preserves_misalignment_p
1026         = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1027
1028       if (dump_enabled_p ())
1029         {
1030           if (step_preserves_misalignment_p)
1031             dump_printf_loc (MSG_NOTE, vect_location,
1032                              "inner step divides the vector alignment.\n");
1033           else
1034             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1035                              "inner step doesn't divide the vector"
1036                              " alignment.\n");
1037         }
1038     }
1039
1040   /* Similarly we can only use base and misalignment information relative to
1041      an innermost loop if the misalignment stays the same throughout the
1042      execution of the loop.  As above, this is the case if the stride of
1043      the dataref evenly divides by the alignment.  */
1044   else
1045     {
1046       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1047       step_preserves_misalignment_p
1048         = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1049
1050       if (!step_preserves_misalignment_p && dump_enabled_p ())
1051         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1052                          "step doesn't divide the vector alignment.\n");
1053     }
1054
1055   unsigned int base_alignment = drb->base_alignment;
1056   unsigned int base_misalignment = drb->base_misalignment;
1057
1058   /* Calculate the maximum of the pooled base address alignment and the
1059      alignment that we can compute for DR itself.  */
1060   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
1061   if (entry && base_alignment < (*entry)->base_alignment)
1062     {
1063       base_alignment = (*entry)->base_alignment;
1064       base_misalignment = (*entry)->base_misalignment;
1065     }
1066
1067   if (drb->offset_alignment < vect_align_c
1068       || !step_preserves_misalignment_p
1069       /* We need to know whether the step wrt the vectorized loop is
1070          negative when computing the starting misalignment below.  */
1071       || TREE_CODE (drb->step) != INTEGER_CST)
1072     {
1073       if (dump_enabled_p ())
1074         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1075                          "Unknown alignment for access: %T\n", ref);
1076       return;
1077     }
1078
1079   if (base_alignment < vect_align_c)
1080     {
1081       unsigned int max_alignment;
1082       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1083       if (max_alignment < vect_align_c
1084           || !vect_can_force_dr_alignment_p (base,
1085                                              vect_align_c * BITS_PER_UNIT))
1086         {
1087           if (dump_enabled_p ())
1088             dump_printf_loc (MSG_NOTE, vect_location,
1089                              "can't force alignment of ref: %T\n", ref);
1090           return;
1091         }
1092
1093       /* Force the alignment of the decl.
1094          NOTE: This is the only change to the code we make during
1095          the analysis phase, before deciding to vectorize the loop.  */
1096       if (dump_enabled_p ())
1097         dump_printf_loc (MSG_NOTE, vect_location,
1098                          "force alignment of %T\n", ref);
1099
1100       dr_info->base_decl = base;
1101       dr_info->base_misaligned = true;
1102       base_misalignment = 0;  /* BASE's alignment is to be forced.  */
1103     }
1104   poly_int64 misalignment
1105     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1106
1107   /* If this is a backward running DR then first access in the larger
1108      vectype actually is N-1 elements before the address in the DR.
1109      Adjust misalign accordingly.  */
1110   if (tree_int_cst_sgn (drb->step) < 0)
1111     /* PLUS because STEP is negative.  */
1112     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1113                      * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1114
1115   unsigned int const_misalignment;
1116   if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1117     {
1118       if (dump_enabled_p ())
1119         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1120                          "Non-constant misalignment for access: %T\n", ref);
1121       return;
1122     }
1123
1124   SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1125
1126   if (dump_enabled_p ())
1127     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1128                      "misalign = %d bytes of ref %T\n",
1129                      DR_MISALIGNMENT (dr_info), ref);
1130
1131   return;
1132 }
1133
1134 /* DR_INFO and DR_PEEL_INFO are related data references that differ only
1135    in DR_INIT.  Return true if peeling to align DR_PEEL_INFO aligns
1136    DR_INFO as well.  */
1137
1138 static bool
1139 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1140                                          dr_vec_info *dr_peel_info)
1141 {
1142   /* The peeled DR's target alignment must be a multiple of DR_INFO's.  */
1143   if (!multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1144                    DR_TARGET_ALIGNMENT (dr_info)))
1145     return false;
1146   /* And the DR_INIT values must be equal or differ by a multiple of
1147      DR_INFO's target alignment.  */
1148   poly_offset_int init_delta
1149     = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1150        - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1151   return (known_eq (init_delta, 0)
1152           || multiple_p (init_delta, DR_TARGET_ALIGNMENT (dr_info)));
1153 }
1154
1155 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1156    aligned via peeling.  */
1157
1158 static bool
1159 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1160                                  dr_vec_info *dr_peel_info)
1161 {
1162   data_reference *dr = dr_info->dr;
1163   data_reference *peel_dr = dr_peel_info->dr;
1164
1165   /* Both DRs must share base address, offset and step, so that they
1166      differ at most in DR_INIT.  */
1167   return (operand_equal_p (DR_BASE_ADDRESS (dr), DR_BASE_ADDRESS (peel_dr), 0)
1168           && operand_equal_p (DR_OFFSET (dr), DR_OFFSET (peel_dr), 0)
1169           && operand_equal_p (DR_STEP (dr), DR_STEP (peel_dr), 0)
1170           && vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info));
1171 }
1172
1173 /* Function vect_update_misalignment_for_peel.
1174    Sets DR_INFO's misalignment
1175    - to 0 if it is aligned whenever DR_PEEL_INFO is,
1176    - to the misalignment computed using NPEEL if both misalignments are known,
1177    - to DR_MISALIGNMENT_UNKNOWN otherwise.
1178
1179    DR_INFO - the data reference whose misalignment is to be adjusted.
1180    DR_PEEL_INFO - the data reference whose misalignment is being made
1181                   zero in the vector loop by the peel.
1182    NPEEL - the number of iterations in the peel loop if the misalignment
1183            of DR_PEEL_INFO is known at compile time.  */
1184
1185 static void
1186 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1187                                    dr_vec_info *dr_peel_info, int npeel)
1188 {
1189   unsigned HOST_WIDE_INT alignment;
1190
1191   /* If dr_info is aligned if dr_peel_info is, then mark it so.  */
1192   if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1193     SET_DR_MISALIGNMENT (dr_info, 0);
1194   else if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1195            && known_alignment_for_access_p (dr_info)
1196            && known_alignment_for_access_p (dr_peel_info))
1197     {
1198       /* Advance the known misalignment by NPEEL steps and mask the
1199          result with ALIGNMENT - 1.  */
1200       int peeled_misal = DR_MISALIGNMENT (dr_info);
1201       peeled_misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1202       peeled_misal &= alignment - 1;
1203       SET_DR_MISALIGNMENT (dr_info, peeled_misal);
1204     }
1205   else
1206     {
1207       if (dump_enabled_p ())
1208         dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1209                          "to unknown (-1).\n");
1210       SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1211     }
1212 }
1213
1214 /* Return true if alignment is relevant for DR_INFO, i.e. if its statement
1215    is relevant, it is the first element of its group (if grouped), and it
1216    is neither a gather/scatter, an invariant nor a non-grouped strided
1217    access.  */
1218
1219 static bool
1220 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1221 {
1222   stmt_vec_info stmt_info = dr_info->stmt;
1223   data_reference *dr = dr_info->dr;
1224
1225   if (!STMT_VINFO_RELEVANT_P (stmt_info))
1226     return false;
1227
1228   /* For interleaving, only the alignment of the first access matters.  */
1229   const bool grouped_p = STMT_VINFO_GROUPED_ACCESS (stmt_info);
1230   if (grouped_p && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1231     return false;
1232
1233   /* Scatter-gather and invariant accesses continue to address individual
1234      scalars, so vector-level alignment is irrelevant.  */
1235   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1236       || integer_zerop (DR_STEP (dr)))
1237     return false;
1238
1239   /* Strided accesses perform only component accesses, so alignment is
1240      irrelevant for them unless they are grouped.  */
1241   return grouped_p || !STMT_VINFO_STRIDED_P (stmt_info);
1242 }
1243
1244 /* Given a memory reference EXP return whether its alignment is less
1245    than its size.  */
1246
1247 static bool
1248 not_size_aligned (tree exp)
1249 {
1250   tree size = TYPE_SIZE (TREE_TYPE (exp));
1251   /* Sizes that don't fit an unsigned HOST_WIDE_INT count as unaligned.  */
1252   if (tree_fits_uhwi_p (size))
1253     return tree_to_uhwi (size) > get_object_alignment (exp);
1254   return true;
1255 }
1256
1257 /* Function vector_alignment_reachable_p
1258
1259    Return true if vector alignment for DR_INFO is reachable by peeling
1260    a few loop iterations.  Return false otherwise.  For a known
1261    misalignment that requires the misalignment to be a multiple of the
1262    element size (and, for grouped accesses, a peel count that is a
1263    multiple of the group size); for an unknown misalignment the target
1264    hook decides.  */
1265
1266 static bool
1267 vector_alignment_reachable_p (dr_vec_info *dr_info)
1268 {
1269   stmt_vec_info stmt_info = dr_info->stmt;
1270   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1271
1272   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1273     {
1274       /* FORNOW: handle only known alignment.  */
1275       if (!known_alignment_for_access_p (dr_info))
1276         return false;
1277
1278       /* For interleaved access we peel only if number of iterations in
1279          the prolog loop ({VF - misalignment}), is a multiple of the
1280          number of the interleaved accesses.  */
1281       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1282       poly_uint64 vec_bytes = GET_MODE_SIZE (TYPE_MODE (vectype));
1283       int unit_bytes = vector_element_size (vec_bytes, nunits);
1284       int misaligned_units = DR_MISALIGNMENT (dr_info) / unit_bytes;
1285       if (!multiple_p (nunits - misaligned_units, DR_GROUP_SIZE (stmt_info)))
1286         return false;
1287     }
1288
1289   if (!known_alignment_for_access_p (dr_info))
1290     {
1291       /* With unknown misalignment leave the decision to the target.  */
1292       tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
1293       bool packed_p = not_size_aligned (DR_REF (dr_info->dr));
1294       if (dump_enabled_p ())
1295         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1296                          "Unknown misalignment, %snaturally aligned\n",
1297                          packed_p ? "not " : "");
1298       return targetm.vectorize.vector_alignment_reachable (ref_type, packed_p);
1299     }
1300
1301   /* If misalignment is known at the compile time then allow peeling
1302      only if natural alignment is reachable through peeling.  */
1303   if (!aligned_access_p (dr_info))
1304     {
1305       HOST_WIDE_INT elmsize
1306         = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1307       if (dump_enabled_p ())
1308         dump_printf_loc (MSG_NOTE, vect_location,
1309                          "data size = %wd. misalignment = %d.\n", elmsize,
1310                          DR_MISALIGNMENT (dr_info));
1311       if (DR_MISALIGNMENT (dr_info) % elmsize)
1312         {
1313           if (dump_enabled_p ())
1314             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1315                              "data size does not divide the misalignment.\n");
1316           return false;
1317         }
1318     }
1319
1320   return true;
1321 }
1322
1323
1324 /* Cost the memory access described by DR_INFO as a vector load or store.  */
1325
1326 static void
1327 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1328                            unsigned int *inside_cost,
1329                            unsigned int *outside_cost,
1330                            stmt_vector_for_cost *body_cost_vec,
1331                            stmt_vector_for_cost *prologue_cost_vec)
1332 {
1333   stmt_vec_info stmt_info = dr_info->stmt;
1334   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1335
1336   /* A pure SLP stmt is costed as a single copy.  */
1337   int ncopies = 1;
1338   if (!PURE_SLP_STMT (stmt_info))
1339     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1340
1341   if (!DR_IS_READ (dr_info->dr))
1342     vect_get_store_cost (vinfo, stmt_info, ncopies, inside_cost,
1343                          body_cost_vec);
1344   else
1345     vect_get_load_cost (vinfo, stmt_info, ncopies, true, inside_cost,
1346                         outside_cost, prologue_cost_vec, body_cost_vec, false);
1347
1348   if (dump_enabled_p ())
1349     dump_printf_loc (MSG_NOTE, vect_location,
1350                      "vect_get_data_access_cost: inside_cost = %d, "
1351                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1352 }
1353
1354
1355 typedef struct _vect_peel_info  /* One entry of the peeling hash table.  */
1356 {
1357   dr_vec_info *dr_info;  /* DR stored when the entry is created.  */
1358   int npeel;  /* Peel count; the hash table key.  */
1359   unsigned int count;  /* Insertions seen for NPEEL, plus any bias added.  */
1360 } *vect_peel_info;
1361
1362 typedef struct _vect_peel_extended_info  /* A peeling choice with its cost.  */
1363 {
1364   vec_info *vinfo;  /* Cast to loop_vec_info by the cost traversal.  */
1365   struct _vect_peel_info peel_info;  /* The selected peeling option.  */
1366   unsigned int inside_cost;  /* Inside cost recorded for PEEL_INFO.  */
1367   unsigned int outside_cost;  /* Outside cost recorded for PEEL_INFO.  */
1368 } *vect_peel_extended_info;
1369
1370
1371 /* Peeling hashtable helpers.  Table entries are identified by their
1372    NPEEL value alone.  */
1373
1374 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1375 {
1376   /* Hash an entry by its peel count.  */
1377   static inline hashval_t
1378   hash (const _vect_peel_info *entry)
1379   {
1380     return (hashval_t) entry->npeel;
1381   }
1382
1383   /* Two entries match when they describe the same peel count.  */
1384   static inline bool
1385   equal (const _vect_peel_info *lhs, const _vect_peel_info *rhs)
1386   {
1387     return lhs->npeel == rhs->npeel;
1388   }
1389 };
1390
1391
1392 /* Insert DR_INFO into peeling hash table with NPEEL as key.  An existing
1393    entry for NPEEL only has its count bumped.  */
1394
1395 static void
1396 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1397                           loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1398                           int npeel)
1399 {
1400   bool supportable_p
1401     = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
1402   /* Look for an existing entry with the same peel count.  */
1403   _vect_peel_info key;
1404   key.npeel = npeel;
1405   _vect_peel_info *entry = peeling_htab->find (&key);
1406   if (!entry)
1407     {
1408       entry = XNEW (struct _vect_peel_info);
1409       entry->npeel = npeel;
1410       entry->dr_info = dr_info;
1411       entry->count = 1;
1412       *peeling_htab->find_slot (entry, INSERT) = entry;
1413     }
1414   else
1415     entry->count++;
1416
1417   /* Under the unlimited cost model bump the count by VECT_MAX_COST for
1418      a DR whose current alignment is not supportable.  */
1419   if (!supportable_p && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1420     entry->count += VECT_MAX_COST;
1421 }
1422
1423
1424 /* Traverse peeling hash table to find peeling option that aligns maximum
1425    number of data accesses.  Ties go to the smaller peel count.  */
1426
1427 int
1428 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1429                                      _vect_peel_extended_info *max)
1430 {
1431   const _vect_peel_info *cand = *slot;
1432   _vect_peel_info &best = max->peel_info;
1433
1434   /* Take CAND if it is more frequent, or equally frequent but peels less.  */
1435   if (cand->count > best.count
1436       || (cand->count == best.count && cand->npeel < best.npeel))
1437     {
1438       best.npeel = cand->npeel;
1439       best.count = cand->count;
1440       best.dr_info = cand->dr_info;
1441     }
1442   return 1;
1443 }
1444
1445 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking data
1446    access costs for all data refs relevant for alignment.  If
1447    UNKNOWN_MISALIGNMENT is true, DR0_INFO is assumed aligned after peeling.  */
1448
1449 static void
1450 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1451                                 dr_vec_info *dr0_info,
1452                                 unsigned int *inside_cost,
1453                                 unsigned int *outside_cost,
1454                                 stmt_vector_for_cost *body_cost_vec,
1455                                 stmt_vector_for_cost *prologue_cost_vec,
1456                                 unsigned int npeel,
1457                                 bool unknown_misalignment)
1458 {
1459   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1460   data_reference *dr;
1461   unsigned ix;
1462   FOR_EACH_VEC_ELT (datarefs, ix, dr)
1463     {
1464       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1465       if (!vect_relevant_for_alignment_p (dr_info))
1466         continue;
1467       /* Temporarily give DR_INFO the misalignment it would have after
1468          peeling, cost the access, then restore the original value.  */
1469       const int orig_misalignment = DR_MISALIGNMENT (dr_info);
1470       if (npeel != 0)
1471         {
1472           if (unknown_misalignment && dr_info == dr0_info)
1473             SET_DR_MISALIGNMENT (dr_info, 0);
1474           else
1475             vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1476         }
1477       vect_get_data_access_cost (loop_vinfo, dr_info, inside_cost, outside_cost,
1478                                  body_cost_vec, prologue_cost_vec);
1479       SET_DR_MISALIGNMENT (dr_info, orig_misalignment);
1480     }
1481 }
1482
1483 /* Traverse peeling hash table and calculate cost for each peeling option.
1484    Find the one with the lowest cost.  */
1485
1486 int
1487 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1488                                    _vect_peel_extended_info *min)
1489 {
1490   vect_peel_info cand = *slot;
1491   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1492   unsigned int cand_inside = 0, cand_outside = 0;
1493   int ignored;
1494
1495   stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1496   prologue_cost_vec.create (2);
1497   body_cost_vec.create (2);
1498   epilogue_cost_vec.create (2);
1499
1500   /* Cost of all relevant data accesses after peeling CAND->npeel times.  */
1501   vect_get_peeling_costs_all_drs (loop_vinfo, cand->dr_info, &cand_inside,
1502                                   &cand_outside, &body_cost_vec,
1503                                   &prologue_cost_vec, cand->npeel, false);
1504   body_cost_vec.release ();
1505
1506   cand_outside += vect_get_known_peeling_cost
1507     (loop_vinfo, cand->npeel, &ignored,
1508      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1509      &prologue_cost_vec, &epilogue_cost_vec);
1510
1511   /* Prologue and epilogue costs are added to the target model later.
1512      These costs depend only on the scalar iteration cost, the
1513      number of peeling iterations finally chosen, and the number of
1514      misaligned statements.  So discard the information found here.  */
1515   prologue_cost_vec.release ();
1516   epilogue_cost_vec.release ();
1517
1518   /* Keep the current minimum unless CAND is cheaper inside the loop, or
1519      equally cheap inside and cheaper outside.  */
1520   if (cand_inside > min->inside_cost
1521       || (cand_inside == min->inside_cost
1522           && cand_outside >= min->outside_cost))
1523     return 1;
1524
1525   min->inside_cost = cand_inside;
1526   min->outside_cost = cand_outside;
1527   min->peel_info.dr_info = cand->dr_info;
1528   min->peel_info.npeel = cand->npeel;
1529   min->peel_info.count = cand->count;
1530   return 1;
1531 }
1532
1533
1534 /* Choose best peeling option by traversing peeling hash table and either
1535    choosing an option with the lowest cost (if cost model is enabled) or the
1536    option that aligns as many accesses as possible.  */
1537
1538 static struct _vect_peel_extended_info
1539 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1540                                        loop_vec_info loop_vinfo)
1541 {
1542    struct _vect_peel_extended_info res;
1543
1544    res.peel_info.dr_info = NULL;
1545    res.vinfo = loop_vinfo;
1546
1547    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1548      {
1549        res.inside_cost = INT_MAX;
1550        res.outside_cost = INT_MAX;
1551        peeling_htab->traverse <_vect_peel_extended_info *,
1552                                vect_peeling_hash_get_lowest_cost> (&res);
1553      }
1554    else
1555      {
1556        res.peel_info.count = 0;
1557        peeling_htab->traverse <_vect_peel_extended_info *,
1558                                vect_peeling_hash_get_most_frequent> (&res);
1559        res.inside_cost = 0;
1560        res.outside_cost = 0;
1561      }
1562
1563    return res;
1564 }
1565
1566 /* Return true if the new peeling NPEEL is supported.  */
1567
1568 static bool
1569 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1570                           unsigned npeel)
1571 {
1572   unsigned i;
1573   struct data_reference *dr = NULL;
1574   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1575   enum dr_alignment_support supportable_dr_alignment;
1576
1577   /* Ensure that all data refs can be vectorized after the peel.  */
1578   FOR_EACH_VEC_ELT (datarefs, i, dr)
1579     {
1580       int save_misalignment;
1581
1582       if (dr == dr0_info->dr)
1583         continue;
1584
1585       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1586       if (!vect_relevant_for_alignment_p (dr_info))
1587         continue;
1588
1589       save_misalignment = DR_MISALIGNMENT (dr_info);
1590       vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1591       supportable_dr_alignment
1592         = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
1593       SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1594
1595       if (!supportable_dr_alignment)
1596         return false;
1597     }
1598
1599   return true;
1600 }
1601
1602 /* Compare two data-references DRA and DRB to group them into chunks
1603    with related alignment.  */
1604
1605 static int
1606 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1607 {
1608   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1609   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1610   int cmp;
1611
1612   /* Stabilize sort.  */
1613   if (dra == drb)
1614     return 0;
1615
1616   /* Ordering of DRs according to base.  */
1617   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1618                                DR_BASE_ADDRESS (drb));
1619   if (cmp != 0)
1620     return cmp;
1621
1622   /* And according to DR_OFFSET.  */
1623   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1624   if (cmp != 0)
1625     return cmp;
1626
1627   /* And after step.  */
1628   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1629   if (cmp != 0)
1630     return cmp;
1631
1632   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
1633   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1634   if (cmp == 0)
1635     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1636   return cmp;
1637 }
1638
1639 /* Function vect_enhance_data_refs_alignment
1640
1641    This pass will use loop versioning and loop peeling in order to enhance
1642    the alignment of data references in the loop.
1643
1644    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1645    original loop is to be vectorized.  Any other loops that are created by
1646    the transformations performed in this pass - are not supposed to be
1647    vectorized.  This restriction will be relaxed.
1648
1649    This pass will require a cost model to guide it whether to apply peeling
1650    or versioning or a combination of the two.  For example, the scheme that
1651    Intel uses when given a loop with several memory accesses, is as follows:
1652    choose one memory access ('p') whose alignment you want to force by doing
1653    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1654    other accesses are not necessarily aligned, or (2) use loop versioning to
1655    generate one loop in which all accesses are aligned, and another loop in
1656    which only 'p' is necessarily aligned.
1657
1658    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1659    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1660    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1661
1662    Devising a cost model is the most critical aspect of this work.  It will
1663    guide us on which access to peel for, whether to use loop versioning, how
1664    many versions to create, etc.  The cost model will probably consist of
1665    generic considerations as well as target specific considerations (on
1666    powerpc for example, misaligned stores are more painful than misaligned
1667    loads).
1668
1669    Here are the general steps involved in alignment enhancements:
1670
1671      -- original loop, before alignment analysis:
1672         for (i=0; i<N; i++){
1673           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1674           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1675         }
1676
1677      -- After vect_compute_data_refs_alignment:
1678         for (i=0; i<N; i++){
1679           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1680           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1681         }
1682
1683      -- Possibility 1: we do loop versioning:
1684      if (p is aligned) {
1685         for (i=0; i<N; i++){    # loop 1A
1686           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1687           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1688         }
1689      }
1690      else {
1691         for (i=0; i<N; i++){    # loop 1B
1692           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1693           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1694         }
1695      }
1696
1697      -- Possibility 2: we do loop peeling:
1698      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1699         x = q[i];
1700         p[i] = y;
1701      }
1702      for (i = 3; i < N; i++){   # loop 2A
1703         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1704         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1705      }
1706
1707      -- Possibility 3: combination of loop peeling and versioning:
1708      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1709         x = q[i];
1710         p[i] = y;
1711      }
1712      if (p is aligned) {
1713         for (i = 3; i<N; i++){  # loop 3A
1714           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1715           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1716         }
1717      }
1718      else {
1719         for (i = 3; i<N; i++){  # loop 3B
1720           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1721           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1722         }
1723      }
1724
1725      These loops are later passed to loop_transform to be vectorized.  The
1726      vectorizer will use the alignment information to guide the transformation
1727      (whether to generate regular loads/stores, or with special handling for
1728      misalignment).  */
1729
1730 opt_result
1731 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1732 {
1733   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1734   enum dr_alignment_support supportable_dr_alignment;
1735   dr_vec_info *first_store = NULL;
1736   dr_vec_info *dr0_info = NULL;
1737   struct data_reference *dr;
1738   unsigned int i;
1739   bool do_peeling = false;
1740   bool do_versioning = false;
1741   unsigned int npeel = 0;
1742   bool one_misalignment_known = false;
1743   bool one_misalignment_unknown = false;
1744   bool one_dr_unsupportable = false;
1745   dr_vec_info *unsupportable_dr_info = NULL;
1746   unsigned int mis, dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1747   hash_table<peel_info_hasher> peeling_htab (1);
1748
1749   DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1750
1751   /* Reset data so we can safely be called multiple times.  */
1752   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1753   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1754
1755   if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1756     return opt_result::success ();
1757
1758   /* Sort the vector of datarefs so DRs that have the same or dependent
1759      alignment are next to each other.  */
1760   auto_vec<data_reference_p> datarefs
1761     = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1762   datarefs.qsort (dr_align_group_sort_cmp);
1763
1764   /* Compute the number of DRs that become aligned when we peel
1765      a dataref so it becomes aligned.  */
1766   auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1767   n_same_align_refs.quick_grow_cleared (datarefs.length ());
1768   unsigned i0;
1769   for (i0 = 0; i0 < datarefs.length (); ++i0)
1770     if (DR_BASE_ADDRESS (datarefs[i0]))
1771       break;
1772   for (i = i0 + 1; i <= datarefs.length (); ++i)
1773     {
1774       if (i == datarefs.length ()
1775           || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1776                                DR_BASE_ADDRESS (datarefs[i]), 0)
1777           || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1778                                DR_OFFSET (datarefs[i]), 0)
1779           || !operand_equal_p (DR_STEP (datarefs[i0]),
1780                                DR_STEP (datarefs[i]), 0))
1781         {
1782           /* The subgroup [i0, i-1] now only differs in DR_INIT and
1783              possibly DR_TARGET_ALIGNMENT.  Still the whole subgroup
1784              will get known misalignment if we align one of the refs
1785              with the largest DR_TARGET_ALIGNMENT.  */
1786           for (unsigned j = i0; j < i; ++j)
1787             {
1788               dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1789               for (unsigned k = i0; k < i; ++k)
1790                 {
1791                   if (k == j)
1792                     continue;
1793                   dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1794                   if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1795                                                                dr_infoj))
1796                     n_same_align_refs[j]++;
1797                 }
1798             }
1799           i0 = i;
1800         }
1801     }
1802
1803   /* While cost model enhancements are expected in the future, the high level
1804      view of the code at this time is as follows:
1805
1806      A) If there is a misaligned access then see if peeling to align
1807         this access can make all data references satisfy
1808         vect_supportable_dr_alignment.  If so, update data structures
1809         as needed and return true.
1810
1811      B) If peeling wasn't possible and there is a data reference with an
1812         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1813         then see if loop versioning checks can be used to make all data
1814         references satisfy vect_supportable_dr_alignment.  If so, update
1815         data structures as needed and return true.
1816
1817      C) If neither peeling nor versioning were successful then return false if
1818         any data reference does not satisfy vect_supportable_dr_alignment.
1819
1820      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1821
1822      Note, Possibility 3 above (which is peeling and versioning together) is not
1823      being done at this time.  */
1824
1825   /* (1) Peeling to force alignment.  */
1826
1827   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1828      Considerations:
1829      + How many accesses will become aligned due to the peeling
1830      - How many accesses will become unaligned due to the peeling,
1831        and the cost of misaligned accesses.
1832      - The cost of peeling (the extra runtime checks, the increase
1833        in code size).  */
1834
1835   FOR_EACH_VEC_ELT (datarefs, i, dr)
1836     {
1837       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1838       if (!vect_relevant_for_alignment_p (dr_info))
1839         continue;
1840
1841       stmt_vec_info stmt_info = dr_info->stmt;
1842       supportable_dr_alignment
1843         = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
1844       do_peeling = vector_alignment_reachable_p (dr_info);
1845       if (do_peeling)
1846         {
1847           if (known_alignment_for_access_p (dr_info))
1848             {
1849               unsigned int npeel_tmp = 0;
1850               bool negative = tree_int_cst_compare (DR_STEP (dr),
1851                                                     size_zero_node) < 0;
1852
1853               /* If known_alignment_for_access_p then we have set
1854                  DR_MISALIGNMENT which is only done if we know it at compile
1855                  time, so it is safe to assume target alignment is constant.
1856                */
1857               unsigned int target_align =
1858                 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1859               unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1860               mis = (negative
1861                      ? DR_MISALIGNMENT (dr_info)
1862                      : -DR_MISALIGNMENT (dr_info));
1863               if (DR_MISALIGNMENT (dr_info) != 0)
1864                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1865
1866               /* For multiple types, it is possible that the bigger type access
1867                  will have more than one peeling option.  E.g., a loop with two
1868                  types: one of size (vector size / 4), and the other one of
1869                  size (vector size / 8).  Vectorization factor is 8.  If both
1870                  accesses are misaligned by 3, the first one needs one scalar
1871                  iteration to be aligned, and the second one needs 5.  But the
1872                  first one will be aligned also by peeling 5 scalar
1873                  iterations, and in that case both accesses will be aligned.
1874                  Hence, except for the immediate peeling amount, we also want
1875                  to try to add full vector size, while we don't exceed
1876                  vectorization factor.
1877                  We do this automatically for cost model, since we calculate
1878                  cost for every peeling option.  */
1879               poly_uint64 nscalars = npeel_tmp;
1880               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1881                 {
1882                   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1883                   nscalars = (STMT_SLP_TYPE (stmt_info)
1884                               ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1885                 }
1886
1887               /* Save info about DR in the hash table.  Also include peeling
1888                  amounts according to the explanation above.  */
1889               while (known_le (npeel_tmp, nscalars))
1890                 {
1891                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1892                                             dr_info, npeel_tmp);
1893                   npeel_tmp += MAX (1, target_align / dr_size);
1894                 }
1895
1896               one_misalignment_known = true;
1897             }
1898           else
1899             {
1900               /* If we don't know any misalignment values, we prefer
1901                  peeling for data-ref that has the maximum number of data-refs
1902                  with the same alignment, unless the target prefers to align
1903                  stores over load.  */
1904               unsigned same_align_drs = n_same_align_refs[i];
1905               if (!dr0_info
1906                   || dr0_same_align_drs < same_align_drs)
1907                 {
1908                   dr0_same_align_drs = same_align_drs;
1909                   dr0_info = dr_info;
1910                 }
1911               /* For data-refs with the same number of related
1912                  accesses prefer the one where the misalign
1913                  computation will be invariant in the outermost loop.  */
1914               else if (dr0_same_align_drs == same_align_drs)
1915                 {
1916                   class loop *ivloop0, *ivloop;
1917                   ivloop0 = outermost_invariant_loop_for_expr
1918                     (loop, DR_BASE_ADDRESS (dr0_info->dr));
1919                   ivloop = outermost_invariant_loop_for_expr
1920                     (loop, DR_BASE_ADDRESS (dr));
1921                   if ((ivloop && !ivloop0)
1922                       || (ivloop && ivloop0
1923                           && flow_loop_nested_p (ivloop, ivloop0)))
1924                     dr0_info = dr_info;
1925                 }
1926
1927               one_misalignment_unknown = true;
1928
1929               /* Check for data refs with unsupportable alignment that
1930                  can be peeled.  */
1931               if (!supportable_dr_alignment)
1932               {
1933                 one_dr_unsupportable = true;
1934                 unsupportable_dr_info = dr_info;
1935               }
1936
1937               if (!first_store && DR_IS_WRITE (dr))
1938                 {
1939                   first_store = dr_info;
1940                   first_store_same_align_drs = same_align_drs;
1941                 }
1942             }
1943         }
1944       else
1945         {
1946           if (!aligned_access_p (dr_info))
1947             {
1948               if (dump_enabled_p ())
1949                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1950                                  "vector alignment may not be reachable\n");
1951               break;
1952             }
1953         }
1954     }
1955
1956   /* Check if we can possibly peel the loop.  */
1957   if (!vect_can_advance_ivs_p (loop_vinfo)
1958       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1959       || loop->inner)
1960     do_peeling = false;
1961
1962   struct _vect_peel_extended_info peel_for_known_alignment;
1963   struct _vect_peel_extended_info peel_for_unknown_alignment;
1964   struct _vect_peel_extended_info best_peel;
1965
1966   peel_for_unknown_alignment.inside_cost = INT_MAX;
1967   peel_for_unknown_alignment.outside_cost = INT_MAX;
1968   peel_for_unknown_alignment.peel_info.count = 0;
1969
1970   if (do_peeling
1971       && one_misalignment_unknown)
1972     {
1973       /* Check if the target requires to prefer stores over loads, i.e., if
1974          misaligned stores are more expensive than misaligned loads (taking
1975          drs with same alignment into account).  */
1976       unsigned int load_inside_cost = 0;
1977       unsigned int load_outside_cost = 0;
1978       unsigned int store_inside_cost = 0;
1979       unsigned int store_outside_cost = 0;
1980       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1981
1982       stmt_vector_for_cost dummy;
1983       dummy.create (2);
1984       vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1985                                       &load_inside_cost,
1986                                       &load_outside_cost,
1987                                       &dummy, &dummy, estimated_npeels, true);
1988       dummy.release ();
1989
1990       if (first_store)
1991         {
1992           dummy.create (2);
1993           vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1994                                           &store_inside_cost,
1995                                           &store_outside_cost,
1996                                           &dummy, &dummy,
1997                                           estimated_npeels, true);
1998           dummy.release ();
1999         }
2000       else
2001         {
2002           store_inside_cost = INT_MAX;
2003           store_outside_cost = INT_MAX;
2004         }
2005
2006       if (load_inside_cost > store_inside_cost
2007           || (load_inside_cost == store_inside_cost
2008               && load_outside_cost > store_outside_cost))
2009         {
2010           dr0_info = first_store;
2011           dr0_same_align_drs = first_store_same_align_drs;
2012           peel_for_unknown_alignment.inside_cost = store_inside_cost;
2013           peel_for_unknown_alignment.outside_cost = store_outside_cost;
2014         }
2015       else
2016         {
2017           peel_for_unknown_alignment.inside_cost = load_inside_cost;
2018           peel_for_unknown_alignment.outside_cost = load_outside_cost;
2019         }
2020
2021       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2022       prologue_cost_vec.create (2);
2023       epilogue_cost_vec.create (2);
2024
2025       int dummy2;
2026       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2027         (loop_vinfo, estimated_npeels, &dummy2,
2028          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2029          &prologue_cost_vec, &epilogue_cost_vec);
2030
2031       prologue_cost_vec.release ();
2032       epilogue_cost_vec.release ();
2033
2034       peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2035     }
2036
2037   peel_for_unknown_alignment.peel_info.npeel = 0;
2038   peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2039
2040   best_peel = peel_for_unknown_alignment;
2041
2042   peel_for_known_alignment.inside_cost = INT_MAX;
2043   peel_for_known_alignment.outside_cost = INT_MAX;
2044   peel_for_known_alignment.peel_info.count = 0;
2045   peel_for_known_alignment.peel_info.dr_info = NULL;
2046
2047   if (do_peeling && one_misalignment_known)
2048     {
2049       /* Peeling is possible, but there is no data access that is not supported
2050          unless aligned.  So we try to choose the best possible peeling from
2051          the hash table.  */
2052       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2053         (&peeling_htab, loop_vinfo);
2054     }
2055
2056   /* Compare costs of peeling for known and unknown alignment. */
2057   if (peel_for_known_alignment.peel_info.dr_info != NULL
2058       && peel_for_unknown_alignment.inside_cost
2059       >= peel_for_known_alignment.inside_cost)
2060     {
2061       best_peel = peel_for_known_alignment;
2062
2063       /* If the best peeling for known alignment has NPEEL == 0, perform no
2064          peeling at all except if there is an unsupportable dr that we can
2065          align.  */
2066       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2067         do_peeling = false;
2068     }
2069
2070   /* If there is an unsupportable data ref, prefer this over all choices so far
2071      since we'd have to discard a chosen peeling except when it accidentally
2072      aligned the unsupportable data ref.  */
2073   if (one_dr_unsupportable)
2074     dr0_info = unsupportable_dr_info;
2075   else if (do_peeling)
2076     {
2077       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2078          TODO: Use nopeel_outside_cost or get rid of it?  */
2079       unsigned nopeel_inside_cost = 0;
2080       unsigned nopeel_outside_cost = 0;
2081
2082       stmt_vector_for_cost dummy;
2083       dummy.create (2);
2084       vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2085                                       &nopeel_outside_cost, &dummy, &dummy,
2086                                       0, false);
2087       dummy.release ();
2088
2089       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
2090          costs will be recorded.  */
2091       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2092       prologue_cost_vec.create (2);
2093       epilogue_cost_vec.create (2);
2094
2095       int dummy2;
2096       nopeel_outside_cost += vect_get_known_peeling_cost
2097         (loop_vinfo, 0, &dummy2,
2098          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2099          &prologue_cost_vec, &epilogue_cost_vec);
2100
2101       prologue_cost_vec.release ();
2102       epilogue_cost_vec.release ();
2103
2104       npeel = best_peel.peel_info.npeel;
2105       dr0_info = best_peel.peel_info.dr_info;
2106
2107       /* If no peeling is not more expensive than the best peeling we
2108          have so far, don't perform any peeling.  */
2109       if (nopeel_inside_cost <= best_peel.inside_cost)
2110         do_peeling = false;
2111     }
2112
2113   if (do_peeling)
2114     {
2115       stmt_vec_info stmt_info = dr0_info->stmt;
2116       if (known_alignment_for_access_p (dr0_info))
2117         {
2118           bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2119                                                 size_zero_node) < 0;
2120           if (!npeel)
2121             {
2122               /* Since it's known at compile time, compute the number of
2123                  iterations in the peeled loop (the peeling factor) for use in
2124                  updating DR_MISALIGNMENT values.  The peeling factor is the
2125                  vectorization factor minus the misalignment as an element
2126                  count.  */
2127               mis = (negative
2128                      ? DR_MISALIGNMENT (dr0_info)
2129                      : -DR_MISALIGNMENT (dr0_info));
2130               /* If known_alignment_for_access_p then we have set
2131                  DR_MISALIGNMENT which is only done if we know it at compile
2132                  time, so it is safe to assume target alignment is constant.
2133                */
2134               unsigned int target_align =
2135                 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2136               npeel = ((mis & (target_align - 1))
2137                        / vect_get_scalar_dr_size (dr0_info));
2138             }
2139
2140           /* For interleaved data access every iteration accesses all the
2141              members of the group, therefore we divide the number of iterations
2142              by the group size.  */
2143           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2144             npeel /= DR_GROUP_SIZE (stmt_info);
2145
2146           if (dump_enabled_p ())
2147             dump_printf_loc (MSG_NOTE, vect_location,
2148                              "Try peeling by %d\n", npeel);
2149         }
2150
2151       /* Ensure that all datarefs can be vectorized after the peel.  */
2152       if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2153         do_peeling = false;
2154
2155       /* Check if all datarefs are supportable and log.  */
2156       if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2157         return opt_result::success ();
2158
2159       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2160       if (do_peeling)
2161         {
2162           unsigned max_allowed_peel
2163             = param_vect_max_peeling_for_alignment;
2164           if (flag_vect_cost_model == VECT_COST_MODEL_CHEAP)
2165             max_allowed_peel = 0;
2166           if (max_allowed_peel != (unsigned)-1)
2167             {
2168               unsigned max_peel = npeel;
2169               if (max_peel == 0)
2170                 {
2171                   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2172                   unsigned HOST_WIDE_INT target_align_c;
2173                   if (target_align.is_constant (&target_align_c))
2174                     max_peel =
2175                       target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2176                   else
2177                     {
2178                       do_peeling = false;
2179                       if (dump_enabled_p ())
2180                         dump_printf_loc (MSG_NOTE, vect_location,
2181                           "Disable peeling, max peels set and vector"
2182                           " alignment unknown\n");
2183                     }
2184                 }
2185               if (max_peel > max_allowed_peel)
2186                 {
2187                   do_peeling = false;
2188                   if (dump_enabled_p ())
2189                     dump_printf_loc (MSG_NOTE, vect_location,
2190                         "Disable peeling, max peels reached: %d\n", max_peel);
2191                 }
2192             }
2193         }
2194
2195       /* Cost model #2 - if peeling may result in a remaining loop not
2196          iterating enough to be vectorized then do not peel.  Since this
2197          is a cost heuristic rather than a correctness decision, use the
2198          most likely runtime value for variable vectorization factors.  */
2199       if (do_peeling
2200           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2201         {
2202           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2203           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2204           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2205               < assumed_vf + max_peel)
2206             do_peeling = false;
2207         }
2208
2209       if (do_peeling)
2210         {
2211           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2212              If the misalignment of DR_i is identical to that of dr0 then set
2213              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2214              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2215              by the peeling factor times the element size of DR_i (MOD the
2216              vectorization factor times the size).  Otherwise, the
2217              misalignment of DR_i must be set to unknown.  */
2218           FOR_EACH_VEC_ELT (datarefs, i, dr)
2219             if (dr != dr0_info->dr)
2220               {
2221                 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2222                 if (!vect_relevant_for_alignment_p (dr_info))
2223                   continue;
2224
2225                 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2226               }
2227
2228           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2229           if (npeel)
2230             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2231           else
2232             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2233               = DR_MISALIGNMENT (dr0_info);
2234           SET_DR_MISALIGNMENT (dr0_info, 0);
2235           if (dump_enabled_p ())
2236             {
2237               dump_printf_loc (MSG_NOTE, vect_location,
2238                                "Alignment of access forced using peeling.\n");
2239               dump_printf_loc (MSG_NOTE, vect_location,
2240                                "Peeling for alignment will be applied.\n");
2241             }
2242
2243           /* The inside-loop cost will be accounted for in vectorizable_load
2244              and vectorizable_store correctly with adjusted alignments.
2245              Drop the body_cst_vec on the floor here.  */
2246           return opt_result::success ();
2247         }
2248     }
2249
2250   /* (2) Versioning to force alignment.  */
2251
2252   /* Try versioning if:
2253      1) optimize loop for speed and the cost-model is not cheap
2254      2) there is at least one unsupported misaligned data ref with an unknown
2255         misalignment, and
2256      3) all misaligned data refs with a known misalignment are supported, and
2257      4) the number of runtime alignment checks is within reason.  */
2258
2259   do_versioning
2260     = (optimize_loop_nest_for_speed_p (loop)
2261        && !loop->inner /* FORNOW */
2262        && flag_vect_cost_model != VECT_COST_MODEL_CHEAP);
2263
2264   if (do_versioning)
2265     {
2266       FOR_EACH_VEC_ELT (datarefs, i, dr)
2267         {
2268           dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2269           if (aligned_access_p (dr_info)
2270               || !vect_relevant_for_alignment_p (dr_info))
2271             continue;
2272
2273           stmt_vec_info stmt_info = dr_info->stmt;
2274           if (STMT_VINFO_STRIDED_P (stmt_info))
2275             {
2276               do_versioning = false;
2277               break;
2278             }
2279
2280           supportable_dr_alignment
2281             = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
2282
2283           if (!supportable_dr_alignment)
2284             {
2285               int mask;
2286               tree vectype;
2287
2288               if (known_alignment_for_access_p (dr_info)
2289                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2290                   >= (unsigned) param_vect_max_version_for_alignment_checks)
2291                 {
2292                   do_versioning = false;
2293                   break;
2294                 }
2295
2296               vectype = STMT_VINFO_VECTYPE (stmt_info);
2297               gcc_assert (vectype);
2298
2299               /* At present we don't support versioning for alignment
2300                  with variable VF, since there's no guarantee that the
2301                  VF is a power of two.  We could relax this if we added
2302                  a way of enforcing a power-of-two size.  */
2303               unsigned HOST_WIDE_INT size;
2304               if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2305                 {
2306                   do_versioning = false;
2307                   break;
2308                 }
2309
2310               /* Forcing alignment in the first iteration is no good if
2311                  we don't keep it across iterations.  For now, just disable
2312                  versioning in this case.
2313                  ?? We could actually unroll the loop to achieve the required
2314                  overall step alignment, and forcing the alignment could be
2315                  done by doing some iterations of the non-vectorized loop.  */
2316               if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2317                                * DR_STEP_ALIGNMENT (dr),
2318                                DR_TARGET_ALIGNMENT (dr_info)))
2319                 {
2320                   do_versioning = false;
2321                   break;
2322                 }
2323
2324               /* The rightmost bits of an aligned address must be zeros.
2325                  Construct the mask needed for this test.  For example,
2326                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2327                  mask must be 15 = 0xf. */
2328               mask = size - 1;
2329
2330               /* FORNOW: use the same mask to test all potentially unaligned
2331                  references in the loop.  */
2332               if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2333                   && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2334                 {
2335                   do_versioning = false;
2336                   break;
2337                 }
2338
2339               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2340               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2341             }
2342         }
2343
2344       /* Versioning requires at least one misaligned data reference.  */
2345       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2346         do_versioning = false;
2347       else if (!do_versioning)
2348         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2349     }
2350
2351   if (do_versioning)
2352     {
2353       vec<stmt_vec_info> may_misalign_stmts
2354         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2355       stmt_vec_info stmt_info;
2356
2357       /* It can now be assumed that the data references in the statements
2358          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2359          of the loop being vectorized.  */
2360       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2361         {
2362           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2363           SET_DR_MISALIGNMENT (dr_info, 0);
2364           if (dump_enabled_p ())
2365             dump_printf_loc (MSG_NOTE, vect_location,
2366                              "Alignment of access forced using versioning.\n");
2367         }
2368
2369       if (dump_enabled_p ())
2370         dump_printf_loc (MSG_NOTE, vect_location,
2371                          "Versioning for alignment will be applied.\n");
2372
2373       /* Peeling and versioning can't be done together at this time.  */
2374       gcc_assert (! (do_peeling && do_versioning));
2375
2376       return opt_result::success ();
2377     }
2378
2379   /* This point is reached if neither peeling nor versioning is being done.  */
2380   gcc_assert (! (do_peeling || do_versioning));
2381
2382   return opt_result::success ();
2383 }
2384
2385
2386 /* Function vect_analyze_data_refs_alignment
2387
2388    Analyze the alignment of the data-references in the loop.
2389    Return FALSE if a data reference is found that cannot be vectorized.  */
2390
2391 opt_result
2392 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2393 {
2394   DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2395
2396   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2397   struct data_reference *dr;
2398   unsigned int i;
2399
2400   vect_record_base_alignments (loop_vinfo);
2401   FOR_EACH_VEC_ELT (datarefs, i, dr)
2402     {
2403       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2404       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2405         vect_compute_data_ref_alignment (loop_vinfo, dr_info);
2406     }
2407
2408   return opt_result::success ();
2409 }
2410
2411
2412 /* Analyze alignment of DRs of stmts in NODE.  */
2413
2414 static bool
2415 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2416 {
2417   /* We vectorize from the first scalar stmt in the node unless
2418      the node is permuted in which case we start from the first
2419      element in the group.  */
2420   stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2421   dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2422   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2423     first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2424
2425   /* We need to commit to a vector type for the group now.  */
2426   if (is_a <bb_vec_info> (vinfo)
2427       && !vect_update_shared_vectype (first_stmt_info, SLP_TREE_VECTYPE (node)))
2428     {
2429       if (dump_enabled_p ())
2430         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2431                          "desired vector type conflicts with earlier one "
2432                          "for %G", first_stmt_info->stmt);
2433       return false;
2434     }
2435
2436   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2437   vect_compute_data_ref_alignment (vinfo, dr_info);
2438   /* In several places we need alignment of the first element anyway.  */
2439   if (dr_info != first_dr_info)
2440     vect_compute_data_ref_alignment (vinfo, first_dr_info);
2441
2442   /* For creating the data-ref pointer we need alignment of the
2443      first element as well.  */
2444   first_stmt_info
2445     = vect_stmt_to_vectorize (vect_find_first_scalar_stmt_in_slp (node));
2446   if (first_stmt_info != SLP_TREE_SCALAR_STMTS (node)[0])
2447     {
2448       first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2449       if (dr_info != first_dr_info)
2450         vect_compute_data_ref_alignment (vinfo, first_dr_info);
2451     }
2452
2453   return true;
2454 }
2455
2456 /* Function vect_slp_analyze_instance_alignment
2457
2458    Analyze the alignment of the data-references in the SLP instance.
2459    Return FALSE if a data reference is found that cannot be vectorized.  */
2460
2461 bool
2462 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2463                                                 slp_instance instance)
2464 {
2465   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2466
2467   slp_tree node;
2468   unsigned i;
2469   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2470     if (! vect_slp_analyze_node_alignment (vinfo, node))
2471       return false;
2472
2473   node = SLP_INSTANCE_TREE (instance);
2474   if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
2475       && ! vect_slp_analyze_node_alignment
2476              (vinfo, SLP_INSTANCE_TREE (instance)))
2477     return false;
2478
2479   return true;
2480 }
2481
2482
2483 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2484    accesses of legal size, step, etc.  Detect gaps, single element
2485    interleaving, and other special cases. Set grouped access info.
2486    Collect groups of strided stores for further use in SLP analysis.
2487    Worker for vect_analyze_group_access.  */
2488
2489 static bool
2490 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2491 {
2492   data_reference *dr = dr_info->dr;
2493   tree step = DR_STEP (dr);
2494   tree scalar_type = TREE_TYPE (DR_REF (dr));
2495   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2496   stmt_vec_info stmt_info = dr_info->stmt;
2497   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2498   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2499   HOST_WIDE_INT dr_step = -1;
2500   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2501   bool slp_impossible = false;
2502
2503   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2504      size of the interleaving group (including gaps).  */
2505   if (tree_fits_shwi_p (step))
2506     {
2507       dr_step = tree_to_shwi (step);
2508       /* Check that STEP is a multiple of type size.  Otherwise there is
2509          a non-element-sized gap at the end of the group which we
2510          cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2511          ???  As we can handle non-constant step fine here we should
2512          simply remove uses of DR_GROUP_GAP between the last and first
2513          element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2514          simply not include that gap.  */
2515       if ((dr_step % type_size) != 0)
2516         {
2517           if (dump_enabled_p ())
2518             dump_printf_loc (MSG_NOTE, vect_location,
2519                              "Step %T is not a multiple of the element size"
2520                              " for %T\n",
2521                              step, DR_REF (dr));
2522           return false;
2523         }
2524       groupsize = absu_hwi (dr_step) / type_size;
2525     }
2526   else
2527     groupsize = 0;
2528
2529   /* Not consecutive access is possible only if it is a part of interleaving.  */
2530   if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2531     {
2532       /* Check if it this DR is a part of interleaving, and is a single
2533          element of the group that is accessed in the loop.  */
2534
2535       /* Gaps are supported only for loads. STEP must be a multiple of the type
2536          size.  */
2537       if (DR_IS_READ (dr)
2538           && (dr_step % type_size) == 0
2539           && groupsize > 0)
2540         {
2541           DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2542           DR_GROUP_SIZE (stmt_info) = groupsize;
2543           DR_GROUP_GAP (stmt_info) = groupsize - 1;
2544           if (dump_enabled_p ())
2545             dump_printf_loc (MSG_NOTE, vect_location,
2546                              "Detected single element interleaving %T"
2547                              " step %T\n",
2548                              DR_REF (dr), step);
2549
2550           return true;
2551         }
2552
2553       if (dump_enabled_p ())
2554         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2555                          "not consecutive access %G", stmt_info->stmt);
2556
2557       if (bb_vinfo)
2558         {
2559           /* Mark the statement as unvectorizable.  */
2560           STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2561           return true;
2562         }
2563
2564       if (dump_enabled_p ())
2565         dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2566       STMT_VINFO_STRIDED_P (stmt_info) = true;
2567       return true;
2568     }
2569
2570   if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2571     {
2572       /* First stmt in the interleaving chain. Check the chain.  */
2573       stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2574       struct data_reference *data_ref = dr;
2575       unsigned int count = 1;
2576       tree prev_init = DR_INIT (data_ref);
2577       HOST_WIDE_INT diff, gaps = 0;
2578
2579       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2580       while (next)
2581         {
2582           /* We never have the same DR multiple times.  */
2583           gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2584                                 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2585
2586           data_ref = STMT_VINFO_DATA_REF (next);
2587
2588           /* All group members have the same STEP by construction.  */
2589           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2590
2591           /* Check that the distance between two accesses is equal to the type
2592              size. Otherwise, we have gaps.  */
2593           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2594                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2595           if (diff != 1)
2596             {
2597               /* FORNOW: SLP of accesses with gaps is not supported.  */
2598               slp_impossible = true;
2599               if (DR_IS_WRITE (data_ref))
2600                 {
2601                   if (dump_enabled_p ())
2602                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2603                                      "interleaved store with gaps\n");
2604                   return false;
2605                 }
2606
2607               gaps += diff - 1;
2608             }
2609
2610           last_accessed_element += diff;
2611
2612           /* Store the gap from the previous member of the group. If there is no
2613              gap in the access, DR_GROUP_GAP is always 1.  */
2614           DR_GROUP_GAP (next) = diff;
2615
2616           prev_init = DR_INIT (data_ref);
2617           next = DR_GROUP_NEXT_ELEMENT (next);
2618           /* Count the number of data-refs in the chain.  */
2619           count++;
2620         }
2621
2622       if (groupsize == 0)
2623         groupsize = count + gaps;
2624
2625       /* This could be UINT_MAX but as we are generating code in a very
2626          inefficient way we have to cap earlier.  See PR78699 for example.  */
2627       if (groupsize > 4096)
2628         {
2629           if (dump_enabled_p ())
2630             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2631                              "group is too large\n");
2632           return false;
2633         }
2634
2635       /* Check that the size of the interleaving is equal to count for stores,
2636          i.e., that there are no gaps.  */
2637       if (groupsize != count
2638           && !DR_IS_READ (dr))
2639         {
2640           groupsize = count;
2641           STMT_VINFO_STRIDED_P (stmt_info) = true;
2642         }
2643
2644       /* If there is a gap after the last load in the group it is the
2645          difference between the groupsize and the last accessed
2646          element.
2647          When there is no gap, this difference should be 0.  */
2648       DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2649
2650       DR_GROUP_SIZE (stmt_info) = groupsize;
2651       if (dump_enabled_p ())
2652         {
2653           dump_printf_loc (MSG_NOTE, vect_location,
2654                            "Detected interleaving ");
2655           if (DR_IS_READ (dr))
2656             dump_printf (MSG_NOTE, "load ");
2657           else if (STMT_VINFO_STRIDED_P (stmt_info))
2658             dump_printf (MSG_NOTE, "strided store ");
2659           else
2660             dump_printf (MSG_NOTE, "store ");
2661           dump_printf (MSG_NOTE, "of size %u\n",
2662                        (unsigned)groupsize);
2663           dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2664           next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2665           while (next)
2666             {
2667               if (DR_GROUP_GAP (next) != 1)
2668                 dump_printf_loc (MSG_NOTE, vect_location,
2669                                  "\t<gap of %d elements>\n",
2670                                  DR_GROUP_GAP (next) - 1);
2671               dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2672               next = DR_GROUP_NEXT_ELEMENT (next);
2673             }
2674           if (DR_GROUP_GAP (stmt_info) != 0)
2675             dump_printf_loc (MSG_NOTE, vect_location,
2676                              "\t<gap of %d elements>\n",
2677                              DR_GROUP_GAP (stmt_info));
2678         }
2679
2680       /* SLP: create an SLP data structure for every interleaving group of
2681          stores for further analysis in vect_analyse_slp.  */
2682       if (DR_IS_WRITE (dr) && !slp_impossible)
2683         {
2684           if (loop_vinfo)
2685             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2686           if (bb_vinfo)
2687             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2688         }
2689     }
2690
2691   return true;
2692 }
2693
2694 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2695    accesses of legal size, step, etc.  Detect gaps, single element
2696    interleaving, and other special cases. Set grouped access info.
2697    Collect groups of strided stores for further use in SLP analysis.  */
2698
2699 static bool
2700 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2701 {
2702   if (!vect_analyze_group_access_1 (vinfo, dr_info))
2703     {
2704       /* Dissolve the group if present.  */
2705       stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2706       while (stmt_info)
2707         {
2708           stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2709           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2710           DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2711           stmt_info = next;
2712         }
2713       return false;
2714     }
2715   return true;
2716 }
2717
2718 /* Analyze the access pattern of the data-reference DR_INFO.
2719    In case of non-consecutive accesses call vect_analyze_group_access() to
2720    analyze groups of accesses.  */
2721
2722 static bool
2723 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2724 {
2725   data_reference *dr = dr_info->dr;
2726   tree step = DR_STEP (dr);
2727   tree scalar_type = TREE_TYPE (DR_REF (dr));
2728   stmt_vec_info stmt_info = dr_info->stmt;
2729   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2730   class loop *loop = NULL;
2731
2732   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2733     return true;
2734
2735   if (loop_vinfo)
2736     loop = LOOP_VINFO_LOOP (loop_vinfo);
2737
2738   if (loop_vinfo && !step)
2739     {
2740       if (dump_enabled_p ())
2741         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2742                          "bad data-ref access in loop\n");
2743       return false;
2744     }
2745
2746   /* Allow loads with zero step in inner-loop vectorization.  */
2747   if (loop_vinfo && integer_zerop (step))
2748     {
2749       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2750       if (!nested_in_vect_loop_p (loop, stmt_info))
2751         return DR_IS_READ (dr);
2752       /* Allow references with zero step for outer loops marked
2753          with pragma omp simd only - it guarantees absence of
2754          loop-carried dependencies between inner loop iterations.  */
2755       if (loop->safelen < 2)
2756         {
2757           if (dump_enabled_p ())
2758             dump_printf_loc (MSG_NOTE, vect_location,
2759                              "zero step in inner loop of nest\n");
2760           return false;
2761         }
2762     }
2763
2764   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2765     {
2766       /* Interleaved accesses are not yet supported within outer-loop
2767         vectorization for references in the inner-loop.  */
2768       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2769
2770       /* For the rest of the analysis we use the outer-loop step.  */
2771       step = STMT_VINFO_DR_STEP (stmt_info);
2772       if (integer_zerop (step))
2773         {
2774           if (dump_enabled_p ())
2775             dump_printf_loc (MSG_NOTE, vect_location,
2776                              "zero step in outer loop.\n");
2777           return DR_IS_READ (dr);
2778         }
2779     }
2780
2781   /* Consecutive?  */
2782   if (TREE_CODE (step) == INTEGER_CST)
2783     {
2784       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2785       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2786           || (dr_step < 0
2787               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2788         {
2789           /* Mark that it is not interleaving.  */
2790           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2791           return true;
2792         }
2793     }
2794
2795   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2796     {
2797       if (dump_enabled_p ())
2798         dump_printf_loc (MSG_NOTE, vect_location,
2799                          "grouped access in outer loop.\n");
2800       return false;
2801     }
2802
2803
2804   /* Assume this is a DR handled by non-constant strided load case.  */
2805   if (TREE_CODE (step) != INTEGER_CST)
2806     return (STMT_VINFO_STRIDED_P (stmt_info)
2807             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2808                 || vect_analyze_group_access (vinfo, dr_info)));
2809
2810   /* Not consecutive access - check if it's a part of interleaving group.  */
2811   return vect_analyze_group_access (vinfo, dr_info);
2812 }
2813
2814 typedef std::pair<data_reference_p, int> data_ref_pair;
2815
2816 /* Compare two data-references DRA and DRB to group them into chunks
2817    suitable for grouping.  */
2818
2819 static int
2820 dr_group_sort_cmp (const void *dra_, const void *drb_)
2821 {
2822   data_ref_pair dra_pair = *(data_ref_pair *)const_cast<void *>(dra_);
2823   data_ref_pair drb_pair = *(data_ref_pair *)const_cast<void *>(drb_);
2824   data_reference_p dra = dra_pair.first;
2825   data_reference_p drb = drb_pair.first;
2826   int cmp;
2827
2828   /* Stabilize sort.  */
2829   if (dra == drb)
2830     return 0;
2831
2832   /* DRs in different basic-blocks never belong to the same group.  */
2833   int bb_index1 = gimple_bb (DR_STMT (dra))->index;
2834   int bb_index2 = gimple_bb (DR_STMT (drb))->index;
2835   if (bb_index1 != bb_index2)
2836     return bb_index1 < bb_index2 ? -1 : 1;
2837
2838   /* Different group IDs lead never belong to the same group.  */
2839   if (dra_pair.second != drb_pair.second)
2840     return dra_pair.second < drb_pair.second ? -1 : 1;
2841
2842   /* Ordering of DRs according to base.  */
2843   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2844                                DR_BASE_ADDRESS (drb));
2845   if (cmp != 0)
2846     return cmp;
2847
2848   /* And according to DR_OFFSET.  */
2849   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2850   if (cmp != 0)
2851     return cmp;
2852
2853   /* Put reads before writes.  */
2854   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2855     return DR_IS_READ (dra) ? -1 : 1;
2856
2857   /* Then sort after access size.  */
2858   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2859                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2860   if (cmp != 0)
2861     return cmp;
2862
2863   /* And after step.  */
2864   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2865   if (cmp != 0)
2866     return cmp;
2867
2868   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2869   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2870   if (cmp == 0)
2871     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2872   return cmp;
2873 }
2874
2875 /* If OP is the result of a conversion, return the unconverted value,
2876    otherwise return null.  */
2877
2878 static tree
2879 strip_conversion (tree op)
2880 {
2881   if (TREE_CODE (op) != SSA_NAME)
2882     return NULL_TREE;
2883   gimple *stmt = SSA_NAME_DEF_STMT (op);
2884   if (!is_gimple_assign (stmt)
2885       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2886     return NULL_TREE;
2887   return gimple_assign_rhs1 (stmt);
2888 }
2889
2890 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2891    and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads can
2892    be grouped in SLP mode.  */
2893
2894 static bool
2895 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2896                    bool allow_slp_p)
2897 {
2898   if (gimple_assign_single_p (stmt1_info->stmt))
2899     return gimple_assign_single_p (stmt2_info->stmt);
2900
2901   gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2902   if (call1 && gimple_call_internal_p (call1))
2903     {
2904       /* Check for two masked loads or two masked stores.  */
2905       gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2906       if (!call2 || !gimple_call_internal_p (call2))
2907         return false;
2908       internal_fn ifn = gimple_call_internal_fn (call1);
2909       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2910         return false;
2911       if (ifn != gimple_call_internal_fn (call2))
2912         return false;
2913
2914       /* Check that the masks are the same.  Cope with casts of masks,
2915          like those created by build_mask_conversion.  */
2916       tree mask1 = gimple_call_arg (call1, 2);
2917       tree mask2 = gimple_call_arg (call2, 2);
2918       if (!operand_equal_p (mask1, mask2, 0)
2919           && (ifn == IFN_MASK_STORE || !allow_slp_p))
2920         {
2921           mask1 = strip_conversion (mask1);
2922           if (!mask1)
2923             return false;
2924           mask2 = strip_conversion (mask2);
2925           if (!mask2)
2926             return false;
2927           if (!operand_equal_p (mask1, mask2, 0))
2928             return false;
2929         }
2930       return true;
2931     }
2932
2933   return false;
2934 }
2935
2936 /* Function vect_analyze_data_ref_accesses.
2937
2938    Analyze the access pattern of all the data references in the loop.
2939
2940    FORNOW: the only access pattern that is considered vectorizable is a
2941            simple step 1 (consecutive) access.
2942
2943    FORNOW: handle only arrays and pointer accesses.  */
2944
2945 opt_result
2946 vect_analyze_data_ref_accesses (vec_info *vinfo,
2947                                 vec<int> *dataref_groups)
2948 {
2949   unsigned int i;
2950   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2951
2952   DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2953
2954   if (datarefs.is_empty ())
2955     return opt_result::success ();
2956
2957   /* Sort the array of datarefs to make building the interleaving chains
2958      linear.  Don't modify the original vector's order, it is needed for
2959      determining what dependencies are reversed.  */
2960   vec<data_ref_pair> datarefs_copy;
2961   datarefs_copy.create (datarefs.length ());
2962   for (unsigned i = 0; i < datarefs.length (); i++)
2963     {
2964       int group_id = dataref_groups ? (*dataref_groups)[i] : 0;
2965       datarefs_copy.quick_push (data_ref_pair (datarefs[i], group_id));
2966     }
2967   datarefs_copy.qsort (dr_group_sort_cmp);
2968   hash_set<stmt_vec_info> to_fixup;
2969
2970   /* Build the interleaving chains.  */
2971   for (i = 0; i < datarefs_copy.length () - 1;)
2972     {
2973       data_reference_p dra = datarefs_copy[i].first;
2974       int dra_group_id = datarefs_copy[i].second;
2975       dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2976       stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2977       stmt_vec_info lastinfo = NULL;
2978       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2979           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2980         {
2981           ++i;
2982           continue;
2983         }
2984       for (i = i + 1; i < datarefs_copy.length (); ++i)
2985         {
2986           data_reference_p drb = datarefs_copy[i].first;
2987           int drb_group_id = datarefs_copy[i].second;
2988           dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2989           stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2990           if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2991               || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2992             break;
2993
2994           /* ???  Imperfect sorting (non-compatible types, non-modulo
2995              accesses, same accesses) can lead to a group to be artificially
2996              split here as we don't just skip over those.  If it really
2997              matters we can push those to a worklist and re-iterate
2998              over them.  The we can just skip ahead to the next DR here.  */
2999
3000           /* DRs in a different BBs should not be put into the same
3001              interleaving group.  */
3002           int bb_index1 = gimple_bb (DR_STMT (dra))->index;
3003           int bb_index2 = gimple_bb (DR_STMT (drb))->index;
3004           if (bb_index1 != bb_index2)
3005             break;
3006
3007           if (dra_group_id != drb_group_id)
3008             break;
3009
3010           /* Check that the data-refs have same first location (except init)
3011              and they are both either store or load (not load and store,
3012              not masked loads or stores).  */
3013           if (DR_IS_READ (dra) != DR_IS_READ (drb)
3014               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3015                                         DR_BASE_ADDRESS (drb)) != 0
3016               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3017               || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3018             break;
3019
3020           /* Check that the data-refs have the same constant size.  */
3021           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3022           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3023           if (!tree_fits_uhwi_p (sza)
3024               || !tree_fits_uhwi_p (szb)
3025               || !tree_int_cst_equal (sza, szb))
3026             break;
3027
3028           /* Check that the data-refs have the same step.  */
3029           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3030             break;
3031
3032           /* Check the types are compatible.
3033              ???  We don't distinguish this during sorting.  */
3034           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3035                                    TREE_TYPE (DR_REF (drb))))
3036             break;
3037
3038           /* Check that the DR_INITs are compile-time constants.  */
3039           if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3040               || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3041             break;
3042
3043           /* Different .GOMP_SIMD_LANE calls still give the same lane,
3044              just hold extra information.  */
3045           if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3046               && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3047               && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3048             break;
3049
3050           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3051           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3052           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3053           HOST_WIDE_INT init_prev
3054             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1].first));
3055           gcc_assert (init_a <= init_b
3056                       && init_a <= init_prev
3057                       && init_prev <= init_b);
3058
3059           /* Do not place the same access in the interleaving chain twice.  */
3060           if (init_b == init_prev)
3061             {
3062               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1].first))
3063                           < gimple_uid (DR_STMT (drb)));
3064               /* Simply link in duplicates and fix up the chain below.  */
3065             }
3066           else
3067             {
3068               /* If init_b == init_a + the size of the type * k, we have an
3069                  interleaving, and DRA is accessed before DRB.  */
3070               HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3071               if (type_size_a == 0
3072                   || (init_b - init_a) % type_size_a != 0)
3073                 break;
3074
3075               /* If we have a store, the accesses are adjacent.  This splits
3076                  groups into chunks we support (we don't support vectorization
3077                  of stores with gaps).  */
3078               if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3079                 break;
3080
3081               /* If the step (if not zero or non-constant) is smaller than the
3082                  difference between data-refs' inits this splits groups into
3083                  suitable sizes.  */
3084               if (tree_fits_shwi_p (DR_STEP (dra)))
3085                 {
3086                   unsigned HOST_WIDE_INT step
3087                     = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3088                   if (step != 0
3089                       && step <= (unsigned HOST_WIDE_INT)(init_b - init_a))
3090                     break;
3091                 }
3092             }
3093
3094           if (dump_enabled_p ())
3095             dump_printf_loc (MSG_NOTE, vect_location,
3096                              DR_IS_READ (dra)
3097                              ? "Detected interleaving load %T and %T\n"
3098                              : "Detected interleaving store %T and %T\n",
3099                              DR_REF (dra), DR_REF (drb));
3100
3101           /* Link the found element into the group list.  */
3102           if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3103             {
3104               DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3105               lastinfo = stmtinfo_a;
3106             }
3107           DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3108           DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3109           lastinfo = stmtinfo_b;
3110
3111           STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3112             = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3113
3114           if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3115             dump_printf_loc (MSG_NOTE, vect_location,
3116                              "Load suitable for SLP vectorization only.\n");
3117
3118           if (init_b == init_prev
3119               && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3120               && dump_enabled_p ())
3121             dump_printf_loc (MSG_NOTE, vect_location,
3122                              "Queuing group with duplicate access for fixup\n");
3123         }
3124     }
3125
3126   /* Fixup groups with duplicate entries by splitting it.  */
3127   while (1)
3128     {
3129       hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3130       if (!(it != to_fixup.end ()))
3131         break;
3132       stmt_vec_info grp = *it;
3133       to_fixup.remove (grp);
3134
3135       /* Find the earliest duplicate group member.  */
3136       unsigned first_duplicate = -1u;
3137       stmt_vec_info next, g = grp;
3138       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3139         {
3140           if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3141                                   DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3142               && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3143             first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3144           g = next;
3145         }
3146       if (first_duplicate == -1U)
3147         continue;
3148
3149       /* Then move all stmts after the first duplicate to a new group.
3150          Note this is a heuristic but one with the property that *it
3151          is fixed up completely.  */
3152       g = grp;
3153       stmt_vec_info newgroup = NULL, ng = grp;
3154       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3155         {
3156           if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3157             {
3158               DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3159               if (!newgroup)
3160                 newgroup = next;
3161               else
3162                 DR_GROUP_NEXT_ELEMENT (ng) = next;
3163               ng = next;
3164               DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3165             }
3166           else
3167             g = DR_GROUP_NEXT_ELEMENT (g);
3168         }
3169       DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3170
3171       /* Fixup the new group which still may contain duplicates.  */
3172       to_fixup.add (newgroup);
3173     }
3174
3175   data_ref_pair *dr_pair;
3176   FOR_EACH_VEC_ELT (datarefs_copy, i, dr_pair)
3177     {
3178       dr_vec_info *dr_info = vinfo->lookup_dr (dr_pair->first);
3179       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3180           && !vect_analyze_data_ref_access (vinfo, dr_info))
3181         {
3182           if (dump_enabled_p ())
3183             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3184                              "not vectorized: complicated access pattern.\n");
3185
3186           if (is_a <bb_vec_info> (vinfo))
3187             {
3188               /* Mark the statement as not vectorizable.  */
3189               STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3190               continue;
3191             }
3192           else
3193             {
3194               datarefs_copy.release ();
3195               return opt_result::failure_at (dr_info->stmt->stmt,
3196                                              "not vectorized:"
3197                                              " complicated access pattern.\n");
3198             }
3199         }
3200     }
3201
3202   datarefs_copy.release ();
3203   return opt_result::success ();
3204 }
3205
3206 /* Function vect_vfa_segment_size.
3207
3208    Input:
3209      DR_INFO: The data reference.
3210      LENGTH_FACTOR: segment length to consider.
3211
3212    Return a value suitable for the dr_with_seg_len::seg_len field.
3213    This is the "distance travelled" by the pointer from the first
3214    iteration in the segment to the last.  Note that it does not include
3215    the size of the access; in effect it only describes the first byte.  */
3216
3217 static tree
3218 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3219 {
3220   length_factor = size_binop (MINUS_EXPR,
3221                               fold_convert (sizetype, length_factor),
3222                               size_one_node);
3223   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3224                      length_factor);
3225 }
3226
3227 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3228    gives the worst-case number of bytes covered by the segment.  */
3229
3230 static unsigned HOST_WIDE_INT
3231 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3232 {
3233   stmt_vec_info stmt_vinfo = dr_info->stmt;
3234   tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3235   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3236   unsigned HOST_WIDE_INT access_size = ref_size;
3237   if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3238     {
3239       gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3240       access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3241     }
3242   if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3243       && (vect_supportable_dr_alignment (vinfo, dr_info, false)
3244           == dr_explicit_realign_optimized))
3245     {
3246       /* We might access a full vector's worth.  */
3247       tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3248       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3249     }
3250   return access_size;
3251 }
3252
3253 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3254    describes.  */
3255
3256 static unsigned int
3257 vect_vfa_align (dr_vec_info *dr_info)
3258 {
3259   return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3260 }
3261
3262 /* Function vect_no_alias_p.
3263
3264    Given data references A and B with equal base and offset, see whether
3265    the alias relation can be decided at compilation time.  Return 1 if
3266    it can and the references alias, 0 if it can and the references do
3267    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3268    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3269    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3270
3271 static int
3272 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3273                          tree segment_length_a, tree segment_length_b,
3274                          unsigned HOST_WIDE_INT access_size_a,
3275                          unsigned HOST_WIDE_INT access_size_b)
3276 {
3277   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3278   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3279   poly_uint64 const_length_a;
3280   poly_uint64 const_length_b;
3281
3282   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3283      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3284      [a, a+12) */
3285   if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3286     {
3287       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3288       offset_a -= const_length_a;
3289     }
3290   else
3291     const_length_a = tree_to_poly_uint64 (segment_length_a);
3292   if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3293     {
3294       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3295       offset_b -= const_length_b;
3296     }
3297   else
3298     const_length_b = tree_to_poly_uint64 (segment_length_b);
3299
3300   const_length_a += access_size_a;
3301   const_length_b += access_size_b;
3302
3303   if (ranges_known_overlap_p (offset_a, const_length_a,
3304                               offset_b, const_length_b))
3305     return 1;
3306
3307   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3308                                offset_b, const_length_b))
3309     return 0;
3310
3311   return -1;
3312 }
3313
3314 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3315    in DDR is >= VF.  */
3316
3317 static bool
3318 dependence_distance_ge_vf (data_dependence_relation *ddr,
3319                            unsigned int loop_depth, poly_uint64 vf)
3320 {
3321   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3322       || DDR_NUM_DIST_VECTS (ddr) == 0)
3323     return false;
3324
3325   /* If the dependence is exact, we should have limited the VF instead.  */
3326   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3327
3328   unsigned int i;
3329   lambda_vector dist_v;
3330   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3331     {
3332       HOST_WIDE_INT dist = dist_v[loop_depth];
3333       if (dist != 0
3334           && !(dist > 0 && DDR_REVERSED_P (ddr))
3335           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3336         return false;
3337     }
3338
3339   if (dump_enabled_p ())
3340     dump_printf_loc (MSG_NOTE, vect_location,
3341                      "dependence distance between %T and %T is >= VF\n",
3342                      DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3343
3344   return true;
3345 }
3346
3347 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3348
3349 static void
3350 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3351 {
3352   dump_printf (dump_kind, "%s (%T) >= ",
3353                lower_bound.unsigned_p ? "unsigned" : "abs",
3354                lower_bound.expr);
3355   dump_dec (dump_kind, lower_bound.min_value);
3356 }
3357
3358 /* Record that the vectorized loop requires the vec_lower_bound described
3359    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3360
3361 static void
3362 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3363                         poly_uint64 min_value)
3364 {
3365   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3366   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3367     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3368       {
3369         unsigned_p &= lower_bounds[i].unsigned_p;
3370         min_value = upper_bound (lower_bounds[i].min_value, min_value);
3371         if (lower_bounds[i].unsigned_p != unsigned_p
3372             || maybe_lt (lower_bounds[i].min_value, min_value))
3373           {
3374             lower_bounds[i].unsigned_p = unsigned_p;
3375             lower_bounds[i].min_value = min_value;
3376             if (dump_enabled_p ())
3377               {
3378                 dump_printf_loc (MSG_NOTE, vect_location,
3379                                  "updating run-time check to ");
3380                 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3381                 dump_printf (MSG_NOTE, "\n");
3382               }
3383           }
3384         return;
3385       }
3386
3387   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3388   if (dump_enabled_p ())
3389     {
3390       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3391       dump_lower_bound (MSG_NOTE, lower_bound);
3392       dump_printf (MSG_NOTE, "\n");
3393     }
3394   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3395 }
3396
3397 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3398    will span fewer than GAP bytes.  */
3399
3400 static bool
3401 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3402                   poly_int64 gap)
3403 {
3404   stmt_vec_info stmt_info = dr_info->stmt;
3405   HOST_WIDE_INT count
3406     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3407   if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3408     count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3409   return (estimated_poly_value (gap)
3410           <= count * vect_get_scalar_dr_size (dr_info));
3411 }
3412
3413 /* Return true if we know that there is no alias between DR_INFO_A and
3414    DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3415    When returning true, set *LOWER_BOUND_OUT to this N.  */
3416
3417 static bool
3418 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3419                                 poly_uint64 *lower_bound_out)
3420 {
3421   /* Check that there is a constant gap of known sign between DR_A
3422      and DR_B.  */
3423   data_reference *dr_a = dr_info_a->dr;
3424   data_reference *dr_b = dr_info_b->dr;
3425   poly_int64 init_a, init_b;
3426   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3427       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3428       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3429       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3430       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3431       || !ordered_p (init_a, init_b))
3432     return false;
3433
3434   /* Sort DR_A and DR_B by the address they access.  */
3435   if (maybe_lt (init_b, init_a))
3436     {
3437       std::swap (init_a, init_b);
3438       std::swap (dr_info_a, dr_info_b);
3439       std::swap (dr_a, dr_b);
3440     }
3441
3442   /* If the two accesses could be dependent within a scalar iteration,
3443      make sure that we'd retain their order.  */
3444   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3445       && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3446     return false;
3447
3448   /* There is no alias if abs (DR_STEP) is greater than or equal to
3449      the bytes spanned by the combination of the two accesses.  */
3450   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3451   return true;
3452 }
3453
3454 /* Function vect_prune_runtime_alias_test_list.
3455
3456    Prune a list of ddrs to be tested at run-time by versioning for alias.
3457    Merge several alias checks into one if possible.
3458    Return FALSE if resulting list of ddrs is longer then allowed by
3459    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3460
3461 opt_result
3462 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3463 {
3464   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3465   hash_set <tree_pair_hash> compared_objects;
3466
3467   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3468   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3469     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3470   vec<vec_object_pair> &check_unequal_addrs
3471     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3472   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3473   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3474
3475   ddr_p ddr;
3476   unsigned int i;
3477   tree length_factor;
3478
3479   DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3480
3481   /* Step values are irrelevant for aliasing if the number of vector
3482      iterations is equal to the number of scalar iterations (which can
3483      happen for fully-SLP loops).  */
3484   bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3485
3486   if (!ignore_step_p)
3487     {
3488       /* Convert the checks for nonzero steps into bound tests.  */
3489       tree value;
3490       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3491         vect_check_lower_bound (loop_vinfo, value, true, 1);
3492     }
3493
3494   if (may_alias_ddrs.is_empty ())
3495     return opt_result::success ();
3496
3497   comp_alias_ddrs.create (may_alias_ddrs.length ());
3498
3499   unsigned int loop_depth
3500     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3501                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3502
3503   /* First, we collect all data ref pairs for aliasing checks.  */
3504   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3505     {
3506       poly_uint64 lower_bound;
3507       tree segment_length_a, segment_length_b;
3508       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3509       unsigned int align_a, align_b;
3510
3511       /* Ignore the alias if the VF we chose ended up being no greater
3512          than the dependence distance.  */
3513       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3514         continue;
3515
3516       if (DDR_OBJECT_A (ddr))
3517         {
3518           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3519           if (!compared_objects.add (new_pair))
3520             {
3521               if (dump_enabled_p ())
3522                 dump_printf_loc (MSG_NOTE, vect_location,
3523                                  "checking that %T and %T"
3524                                  " have different addresses\n",
3525                                  new_pair.first, new_pair.second);
3526               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3527             }
3528           continue;
3529         }
3530
3531       dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3532       stmt_vec_info stmt_info_a = dr_info_a->stmt;
3533
3534       dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3535       stmt_vec_info stmt_info_b = dr_info_b->stmt;
3536
3537       bool preserves_scalar_order_p
3538         = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3539
3540       /* Skip the pair if inter-iteration dependencies are irrelevant
3541          and intra-iteration dependencies are guaranteed to be honored.  */
3542       if (ignore_step_p
3543           && (preserves_scalar_order_p
3544               || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3545                                                  &lower_bound)))
3546         {
3547           if (dump_enabled_p ())
3548             dump_printf_loc (MSG_NOTE, vect_location,
3549                              "no need for alias check between "
3550                              "%T and %T when VF is 1\n",
3551                              DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3552           continue;
3553         }
3554
3555       /* See whether we can handle the alias using a bounds check on
3556          the step, and whether that's likely to be the best approach.
3557          (It might not be, for example, if the minimum step is much larger
3558          than the number of bytes handled by one vector iteration.)  */
3559       if (!ignore_step_p
3560           && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3561           && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3562                                              &lower_bound)
3563           && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3564               || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3565         {
3566           bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3567           if (dump_enabled_p ())
3568             {
3569               dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3570                                "%T and %T when the step %T is outside ",
3571                                DR_REF (dr_info_a->dr),
3572                                DR_REF (dr_info_b->dr),
3573                                DR_STEP (dr_info_a->dr));
3574               if (unsigned_p)
3575                 dump_printf (MSG_NOTE, "[0");
3576               else
3577                 {
3578                   dump_printf (MSG_NOTE, "(");
3579                   dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3580                 }
3581               dump_printf (MSG_NOTE, ", ");
3582               dump_dec (MSG_NOTE, lower_bound);
3583               dump_printf (MSG_NOTE, ")\n");
3584             }
3585           vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3586                                   unsigned_p, lower_bound);
3587           continue;
3588         }
3589
3590       stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3591       if (dr_group_first_a)
3592         {
3593           stmt_info_a = dr_group_first_a;
3594           dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3595         }
3596
3597       stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3598       if (dr_group_first_b)
3599         {
3600           stmt_info_b = dr_group_first_b;
3601           dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3602         }
3603
3604       if (ignore_step_p)
3605         {
3606           segment_length_a = size_zero_node;
3607           segment_length_b = size_zero_node;
3608         }
3609       else
3610         {
3611           if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3612                                 DR_STEP (dr_info_b->dr), 0))
3613             length_factor = scalar_loop_iters;
3614           else
3615             length_factor = size_int (vect_factor);
3616           segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3617           segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3618         }
3619       access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3620       access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3621       align_a = vect_vfa_align (dr_info_a);
3622       align_b = vect_vfa_align (dr_info_b);
3623
3624       /* See whether the alias is known at compilation time.  */
3625       if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3626                            DR_BASE_ADDRESS (dr_info_b->dr), 0)
3627           && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3628                               DR_OFFSET (dr_info_b->dr), 0)
3629           && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3630           && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3631           && poly_int_tree_p (segment_length_a)
3632           && poly_int_tree_p (segment_length_b))
3633         {
3634           int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3635                                              segment_length_a,
3636                                              segment_length_b,
3637                                              access_size_a,
3638                                              access_size_b);
3639           if (res >= 0 && dump_enabled_p ())
3640             {
3641               dump_printf_loc (MSG_NOTE, vect_location,
3642                                "can tell at compile time that %T and %T",
3643                                DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3644               if (res == 0)
3645                 dump_printf (MSG_NOTE, " do not alias\n");
3646               else
3647                 dump_printf (MSG_NOTE, " alias\n");
3648             }
3649
3650           if (res == 0)
3651             continue;
3652
3653           if (res == 1)
3654             return opt_result::failure_at (stmt_info_b->stmt,
3655                                            "not vectorized:"
3656                                            " compilation time alias: %G%G",
3657                                            stmt_info_a->stmt,
3658                                            stmt_info_b->stmt);
3659         }
3660
3661       dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3662                             access_size_a, align_a);
3663       dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3664                             access_size_b, align_b);
3665       /* Canonicalize the order to be the one that's needed for accurate
3666          RAW, WAR and WAW flags, in cases where the data references are
3667          well-ordered.  The order doesn't really matter otherwise,
3668          but we might as well be consistent.  */
3669       if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3670         std::swap (dr_a, dr_b);
3671
3672       dr_with_seg_len_pair_t dr_with_seg_len_pair
3673         (dr_a, dr_b, (preserves_scalar_order_p
3674                       ? dr_with_seg_len_pair_t::WELL_ORDERED
3675                       : dr_with_seg_len_pair_t::REORDERED));
3676
3677       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3678     }
3679
3680   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3681
3682   unsigned int count = (comp_alias_ddrs.length ()
3683                         + check_unequal_addrs.length ());
3684
3685   if (dump_enabled_p ())
3686     dump_printf_loc (MSG_NOTE, vect_location,
3687                      "improved number of alias checks from %d to %d\n",
3688                      may_alias_ddrs.length (), count);
3689   unsigned limit = param_vect_max_version_for_alias_checks;
3690   if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
3691     limit = param_vect_max_version_for_alias_checks * 6 / 10;
3692   if (count > limit)
3693     return opt_result::failure_at
3694       (vect_location,
3695        "number of versioning for alias run-time tests exceeds %d "
3696        "(--param vect-max-version-for-alias-checks)\n", limit);
3697
3698   return opt_result::success ();
3699 }
3700
3701 /* Check whether we can use an internal function for a gather load
3702    or scatter store.  READ_P is true for loads and false for stores.
3703    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3704    the type of the memory elements being loaded or stored.  OFFSET_TYPE
3705    is the type of the offset that is being applied to the invariant
3706    base address.  SCALE is the amount by which the offset should
3707    be multiplied *after* it has been converted to address width.
3708
3709    Return true if the function is supported, storing the function id in
3710    *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.  */
3711
3712 bool
3713 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3714                           tree vectype, tree memory_type, tree offset_type,
3715                           int scale, internal_fn *ifn_out,
3716                           tree *offset_vectype_out)
3717 {
3718   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3719   unsigned int element_bits = vector_element_bits (vectype);
3720   if (element_bits != memory_bits)
3721     /* For now the vector elements must be the same width as the
3722        memory elements.  */
3723     return false;
3724
3725   /* Work out which function we need.  */
3726   internal_fn ifn;
3727   if (read_p)
3728     ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3729   else
3730     ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3731
3732   for (;;)
3733     {
3734       tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3735       if (!offset_vectype)
3736         return false;
3737
3738       /* Test whether the target supports this combination.  */
3739       if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3740                                                   offset_vectype, scale))
3741         {
3742           *ifn_out = ifn;
3743           *offset_vectype_out = offset_vectype;
3744           return true;
3745         }
3746
3747       if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3748           && TYPE_PRECISION (offset_type) >= element_bits)
3749         return false;
3750
3751       offset_type = build_nonstandard_integer_type
3752         (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3753     }
3754 }
3755
3756 /* Fill in *INFO from STMT_INFO, whose statement is already a call to an
3757    internal gather load or scatter store function.  */
3758
3759 static void
3760 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3761                                    gather_scatter_info *info)
3762 {
3763   gcall *gs_call = as_a <gcall *> (stmt_info->stmt);
3764   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3765   data_reference *dref = STMT_VINFO_DATA_REF (stmt_info);
3766
3767   info->decl = NULL_TREE;
3768   info->ifn = gimple_call_internal_fn (gs_call);
3769   info->offset_vectype = NULL_TREE;
3770   info->offset_dt = vect_unknown_def_type;
3771   info->memory_type = TREE_TYPE (DR_REF (dref));
3772   info->element_type = TREE_TYPE (vectype);
3773   info->base = gimple_call_arg (gs_call, 0);
3774   info->offset = gimple_call_arg (gs_call, 1);
3775   info->scale = TREE_INT_CST_LOW (gimple_call_arg (gs_call, 2));
3776 }
3777
3778 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3779    gather load or scatter store.  Describe the operation in *INFO if so.

        Unless the statement is already a gather/scatter internal call, the
        address is split into a loop-invariant BASE, an SSA name OFF that is
        not invariant in the loop of LOOP_VINFO, and a constant SCALE.  The
        operation is then described either by an internal function
        (INFO->ifn) or, when internal functions are not used, by a target
        built-in (INFO->decl, with INFO->ifn set to IFN_LAST).  */
3780
3781 bool
3782 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3783                            gather_scatter_info *info)
3784 {
3785   HOST_WIDE_INT scale = 1;
3786   poly_int64 pbitpos, pbitsize;
3787   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3788   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3789   tree offtype = NULL_TREE;
3790   tree decl = NULL_TREE, base, off;
3791   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3792   tree memory_type = TREE_TYPE (DR_REF (dr));
3793   machine_mode pmode;
3794   int punsignedp, reversep, pvolatilep = 0;
3795   internal_fn ifn;
3796   tree offset_vectype;
3797   bool masked_p = false;
3798
3799   /* See whether this is already a call to a gather/scatter internal function.
3800      If not, see whether it's a masked load or store.  */
3801   gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3802   if (call && gimple_call_internal_p (call))
3803     {
3804       ifn = gimple_call_internal_fn (call);
3805       if (internal_gather_scatter_fn_p (ifn))
3806         {
3807           vect_describe_gather_scatter_call (stmt_info, info);
3808           return true;
3809         }
3810       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3811     }
3812
3813   /* True if we should aim to use internal functions rather than
3814      built-in functions.  */
3815   bool use_ifn_p = (DR_IS_READ (dr)
3816                     ? supports_vec_gather_load_p ()
3817                     : supports_vec_scatter_store_p ());
3818
3819   base = DR_REF (dr);
3820   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3821      see if we can use the def stmt of the address.  */
3822   if (masked_p
3823       && TREE_CODE (base) == MEM_REF
3824       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3825       && integer_zerop (TREE_OPERAND (base, 1))
3826       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3827     {
3828       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3829       if (is_gimple_assign (def_stmt)
3830           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3831         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3832     }
3833
3834   /* The gather and scatter builtins need address of the form
3835      loop_invariant + vector * {1, 2, 4, 8}
3836      or
3837      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3838      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3839      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3840      multiplications and additions in it.  To get a vector, we need
3841      a single SSA_NAME that will be defined in the loop and will
3842      contain everything that is not loop invariant and that can be
3843      vectorized.  The following code attempts to find such a preexisting
3844      SSA_NAME OFF and put the loop invariants into a tree BASE
3845      that can be gimplified before the loop.  */
3846   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3847                               &punsignedp, &reversep, &pvolatilep);
       /* NOTE(review): REVERSEP presumably flags reverse storage order;
          such references are rejected outright.  */
3848   if (reversep)
3849     return false;
3850
       /* The constant bit position expressed in units of BITS_PER_UNIT.  */
3851   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3852
       /* For a MEM_REF, fold a nonzero constant offset operand into OFF and
          continue with its address operand as BASE; for anything else take
          the address of BASE.  */
3853   if (TREE_CODE (base) == MEM_REF)
3854     {
3855       if (!integer_zerop (TREE_OPERAND (base, 1)))
3856         {
3857           if (off == NULL_TREE)
3858             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3859           else
3860             off = size_binop (PLUS_EXPR, off,
3861                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3862         }
3863       base = TREE_OPERAND (base, 0);
3864     }
3865   else
3866     base = build_fold_addr_expr (base);
3867
3868   if (off == NULL_TREE)
3869     off = size_zero_node;
3870
3871   /* If base is not loop invariant, either off is 0, then we start with just
3872      the constant offset in the loop invariant BASE and continue with base
3873      as OFF, otherwise give up.
3874      We could handle that case by gimplifying the addition of base + off
3875      into some SSA_NAME and use that as off, but for now punt.  */
3876   if (!expr_invariant_in_loop_p (loop, base))
3877     {
3878       if (!integer_zerop (off))
3879         return false;
3880       off = base;
3881       base = size_int (pbytepos);
3882     }
3883   /* Otherwise put base + constant offset into the loop invariant BASE
3884      and continue with OFF.  */
3885   else
3886     {
3887       base = fold_convert (sizetype, base);
3888       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3889     }
3890
3891   /* OFF at this point may be either a SSA_NAME or some tree expression
3892      from get_inner_reference.  Try to peel off loop invariants from it
3893      into BASE as long as possible.  The loop ends either through a break
          or once OFFTYPE has been set by the widening-conversion case.  */
3894   STRIP_NOPS (off);
3895   while (offtype == NULL_TREE)
3896     {
3897       enum tree_code code;
3898       tree op0, op1, add = NULL_TREE;
3899
3900       if (TREE_CODE (off) == SSA_NAME)
3901         {
3902           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3903
3904           if (expr_invariant_in_loop_p (loop, off))
3905             return false;
3906
               /* Only assignments can be looked through; anything else
                  leaves OFF as the final offset.  */
3907           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3908             break;
3909
3910           op0 = gimple_assign_rhs1 (def_stmt);
3911           code = gimple_assign_rhs_code (def_stmt);
3912           op1 = gimple_assign_rhs2 (def_stmt);
3913         }
3914       else
3915         {
3916           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3917             return false;
3918           code = TREE_CODE (off);
3919           extract_ops_from_tree (off, &code, &op0, &op1);
3920         }
3921       switch (code)
3922         {
3923         case POINTER_PLUS_EXPR:
3924         case PLUS_EXPR:
3925           if (expr_invariant_in_loop_p (loop, op0))
3926             {
3927               add = op0;
3928               off = op1;
3929             do_add:
                   /* Shared tail, also reached by goto from the cases
                      below: add the invariant ADD, multiplied by the scale
                      found so far, to BASE and keep peeling OFF.  */
3930               add = fold_convert (sizetype, add);
3931               if (scale != 1)
3932                 add = size_binop (MULT_EXPR, add, size_int (scale));
3933               base = size_binop (PLUS_EXPR, base, add);
3934               continue;
3935             }
3936           if (expr_invariant_in_loop_p (loop, op1))
3937             {
3938               add = op1;
3939               off = op0;
3940               goto do_add;
3941             }
3942           break;
3943         case MINUS_EXPR:
               /* Subtracting an invariant is handled as adding its
                  negation.  */
3944           if (expr_invariant_in_loop_p (loop, op1))
3945             {
3946               add = fold_convert (sizetype, op1);
3947               add = size_binop (MINUS_EXPR, size_zero_node, add);
3948               off = op0;
3949               goto do_add;
3950             }
3951           break;
3952         case MULT_EXPR:
               /* Only the first multiplication met (SCALE still 1) can
                  become the scale.  */
3953           if (scale == 1 && tree_fits_shwi_p (op1))
3954             {
                   /* NOTE(review): NEW_SCALE is an int whereas SCALE is a
                      HOST_WIDE_INT, so a multiplier that fits a signed HWI
                      but not an int would presumably be narrowed here -
                      confirm whether that can happen.  */
3955               int new_scale = tree_to_shwi (op1);
3956               /* Only treat this as a scaling operation if the target
3957                  supports it for at least some offset type.  */
3958               if (use_ifn_p
3959                   && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3960                                                 masked_p, vectype, memory_type,
3961                                                 signed_char_type_node,
3962                                                 new_scale, &ifn,
3963                                                 &offset_vectype)
3964                   && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3965                                                 masked_p, vectype, memory_type,
3966                                                 unsigned_char_type_node,
3967                                                 new_scale, &ifn,
3968                                                 &offset_vectype))
3969                 break;
3970               scale = new_scale;
3971               off = op0;
3972               continue;
3973             }
3974           break;
3975         case SSA_NAME:
               /* The right-hand side is just another SSA name; continue
                  with that name.  */
3976           off = op0;
3977           continue;
3978         CASE_CONVERT:
3979           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3980               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3981             break;
3982
3983           /* Don't include the conversion if the target is happy with
3984              the current offset type.  */
3985           if (use_ifn_p
3986               && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3987                                            masked_p, vectype, memory_type,
3988                                            TREE_TYPE (off), scale, &ifn,
3989                                            &offset_vectype))
3990             break;
3991
               /* A conversion between types of equal precision is simply
                  looked through.  */
3992           if (TYPE_PRECISION (TREE_TYPE (op0))
3993               == TYPE_PRECISION (TREE_TYPE (off)))
3994             {
3995               off = op0;
3996               continue;
3997             }
3998
               /* A widening conversion: continue with the narrower operand
                  and record its type as OFFTYPE, which also terminates the
                  loop.  */
3999           if (TYPE_PRECISION (TREE_TYPE (op0))
4000               < TYPE_PRECISION (TREE_TYPE (off)))
4001             {
4002               off = op0;
4003               offtype = TREE_TYPE (off);
4004               STRIP_NOPS (off);
4005               continue;
4006             }
4007           break;
4008         default:
4009           break;
4010         }
4011       break;
4012     }
4013
4014   /* If at the end OFF still isn't a SSA_NAME or isn't
4015      defined in the loop, punt.  */
4016   if (TREE_CODE (off) != SSA_NAME
4017       || expr_invariant_in_loop_p (loop, off))
4018     return false;
4019
4020   if (offtype == NULL_TREE)
4021     offtype = TREE_TYPE (off);
4022
4023   if (use_ifn_p)
4024     {
4025       if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4026                                      vectype, memory_type, offtype, scale,
4027                                      &ifn, &offset_vectype))
4028         return false;
4029     }
4030   else
4031     {
           /* Ask the target hook for a built-in gather or scatter.  DECL
              stays null if the relevant hook is not provided or yields
              nothing, in which case the access is rejected.  */
4032       if (DR_IS_READ (dr))
4033         {
4034           if (targetm.vectorize.builtin_gather)
4035             decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4036         }
4037       else
4038         {
4039           if (targetm.vectorize.builtin_scatter)
4040             decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4041         }
4042
4043       if (!decl)
4044         return false;
4045
4046       ifn = IFN_LAST;
4047       /* The offset vector type will be read from DECL when needed.  */
4048       offset_vectype = NULL_TREE;
4049     }
4050
4051   info->ifn = ifn;
4052   info->decl = decl;
4053   info->base = base;
4054   info->offset = off;
4055   info->offset_dt = vect_unknown_def_type;
4056   info->offset_vectype = offset_vectype;
4057   info->scale = scale;
4058   info->element_type = TREE_TYPE (vectype);
4059   info->memory_type = memory_type;
4060   return true;
4061 }
4062
4063 /* Find the data references in STMT, analyze them with respect to LOOP and
4064    append them to DATAREFS.  Return false if datarefs in this stmt cannot
4065    be handled.

        If DATAREF_GROUPS is nonnull, GROUP_ID is pushed onto it each time a
        data reference is pushed onto DATAREFS.  Clobbers and statements
        without any data reference succeed without appending anything.  */
4066
4067 opt_result
4068 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4069                                vec<data_reference_p> *datarefs,
4070                                vec<int> *dataref_groups, int group_id)
4071 {
4072   /* We can ignore clobbers for dataref analysis - they are removed during
4073      loop vectorization and BB vectorization checks dependences with a
4074      stmt walk.  */
4075   if (gimple_clobber_p (stmt))
4076     return opt_result::success ();
4077
4078   if (gimple_has_volatile_ops (stmt))
4079     return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4080                                    stmt);
4081
4082   if (stmt_can_throw_internal (cfun, stmt))
4083     return opt_result::failure_at (stmt,
4084                                    "not vectorized:"
4085                                    " statement can throw an exception: %G",
4086                                    stmt);
4087
4088   auto_vec<data_reference_p, 2> refs;
4089   opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4090   if (!res)
4091     return res;
4092
4093   if (refs.is_empty ())
4094     return opt_result::success ();
4095
       /* Only statements with a single data reference are handled; free
          everything that was collected before failing.  */
4096   if (refs.length () > 1)
4097     {
4098       while (!refs.is_empty ())
4099         free_data_ref (refs.pop ());
4100       return opt_result::failure_at (stmt,
4101                                      "not vectorized: more than one "
4102                                      "data ref in stmt: %G", stmt);
4103     }
4104
       /* From here on DR is owned by this function until it is either
          freed or pushed onto DATAREFS.  */
4105   data_reference_p dr = refs.pop ();
       /* A data reference in a call is only accepted for the internal
          functions IFN_MASK_LOAD and IFN_MASK_STORE.  */
4106   if (gcall *call = dyn_cast <gcall *> (stmt))
4107     if (!gimple_call_internal_p (call)
4108         || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4109             && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4110       {
4111         free_data_ref (dr);
4112         return opt_result::failure_at (stmt,
4113                                        "not vectorized: dr in a call %G", stmt);
4114       }
4115
4116   if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4117       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4118     {
4119       free_data_ref (dr);
4120       return opt_result::failure_at (stmt,
4121                                      "not vectorized:"
4122                                      " statement is bitfield access %G", stmt);
4123     }
4124
4125   if (DR_BASE_ADDRESS (dr)
4126       && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4127     {
4128       free_data_ref (dr);
4129       return opt_result::failure_at (stmt,
4130                                      "not vectorized:"
4131                                      " base addr of dr is a constant\n");
4132     }
4133
4134   /* Check whether this may be a SIMD lane access and adjust the
4135      DR to make it easier for us to handle it.  This is only tried when
          LOOP has a simduid and the analysis of DR left one of its base
          address, offset, init or step unset.  */
4136   if (loop
4137       && loop->simduid
4138       && (!DR_BASE_ADDRESS (dr)
4139           || !DR_OFFSET (dr)
4140           || !DR_INIT (dr)
4141           || !DR_STEP (dr)))
4142     {
           /* Re-analyze the same reference with respect to the loop that
              contains STMT, passing a null first argument.  */
4143       struct data_reference *newdr
4144         = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4145                            DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4146       if (DR_BASE_ADDRESS (newdr)
4147           && DR_OFFSET (newdr)
4148           && DR_INIT (newdr)
4149           && DR_STEP (newdr)
4150           && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4151           && integer_zerop (DR_STEP (newdr)))
4152         {
4153           tree base_address = DR_BASE_ADDRESS (newdr);
4154           tree off = DR_OFFSET (newdr);
               /* STEP stays 1 unless OFF turns out to be a multiplication
                  by a constant below.  */
4155           tree step = ssize_int (1);
4156           if (integer_zerop (off)
4157               && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4158             {
4159               off = TREE_OPERAND (base_address, 1);
4160               base_address = TREE_OPERAND (base_address, 0);
4161             }
4162           STRIP_NOPS (off);
4163           if (TREE_CODE (off) == MULT_EXPR
4164               && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4165             {
4166               step = TREE_OPERAND (off, 1);
4167               off = TREE_OPERAND (off, 0);
4168               STRIP_NOPS (off);
4169             }
               /* Look through a widening conversion in the tree itself.  */
4170           if (CONVERT_EXPR_P (off)
4171               && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4172                   < TYPE_PRECISION (TREE_TYPE (off))))
4173             off = TREE_OPERAND (off, 0);
4174           if (TREE_CODE (off) == SSA_NAME)
4175             {
4176               gimple *def = SSA_NAME_DEF_STMT (off);
4177               /* Look through widening conversion.  */
4178               if (is_gimple_assign (def)
4179                   && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4180                 {
4181                   tree rhs1 = gimple_assign_rhs1 (def);
4182                   if (TREE_CODE (rhs1) == SSA_NAME
4183                       && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4184                       && (TYPE_PRECISION (TREE_TYPE (off))
4185                           > TYPE_PRECISION (TREE_TYPE (rhs1))))
4186                     def = SSA_NAME_DEF_STMT (rhs1);
4187                 }
                   /* The offset must come from an IFN_GOMP_SIMD_LANE call
                      whose first argument is based on LOOP's simduid.  */
4188               if (is_gimple_call (def)
4189                   && gimple_call_internal_p (def)
4190                   && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4191                 {
4192                   tree arg = gimple_call_arg (def, 0);
4193                   tree reft = TREE_TYPE (DR_REF (newdr));
4194                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
4195                   arg = SSA_NAME_VAR (arg);
4196                   if (arg == loop->simduid
4197                       /* For now.  */
4198                       && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4199                     {
4200                       DR_BASE_ADDRESS (newdr) = base_address;
4201                       DR_OFFSET (newdr) = ssize_int (0);
4202                       DR_STEP (newdr) = step;
4203                       DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4204                       DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4205                       /* Mark as simd-lane access.  AUX is set to
                              -1 - ARG2, a negative marker that also records
                              the second argument of the call.  */
4206                       tree arg2 = gimple_call_arg (def, 1);
4207                       newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
                           /* NEWDR replaces DR from here on.  */
4208                       free_data_ref (dr);
4209                       datarefs->safe_push (newdr);
4210                       if (dataref_groups)
4211                         dataref_groups->safe_push (group_id);
4212                       return opt_result::success ();
4213                     }
4214                 }
4215             }
4216         }
           /* Not recognised as a SIMD lane access: discard NEWDR and fall
              back to the original DR.  */
4217       free_data_ref (newdr);
4218     }
4219
4220   datarefs->safe_push (dr);
4221   if (dataref_groups)
4222     dataref_groups->safe_push (group_id);
4223   return opt_result::success ();
4224 }
4225
4226 /* Function vect_analyze_data_refs.
4227
4228    Find all the data references in the loop or basic block.
4229
4230    The general structure of the analysis of data refs in the vectorizer is as
4231    follows:
4232    1- vect_analyze_data_refs(loop/bb): call
4233       compute_data_dependences_for_loop/bb to find and analyze all data-refs
4234       in the loop/bb and their dependences.
4235    2- vect_analyze_dependences(): apply dependence testing using ddrs.
4236    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4237    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4238
        This function walks the data references already recorded in
        VINFO->shared->datarefs, attaches each one to its statement, picks a
        vector type for it and raises *MIN_VF to at least the number of
        subparts of that vector type.  If FATAL is nonnull, *FATAL is set to
        false when the failure is a missing vector type or an access that is
        not suitable for a gather load or scatter store.

4239 */
4240
4241 opt_result
4242 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4243 {
4244   class loop *loop = NULL;
4245   unsigned int i;
4246   struct data_reference *dr;
4247   tree scalar_type;
4248
4249   DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4250
       /* LOOP stays null for basic-block vectorization.  */
4251   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4252     loop = LOOP_VINFO_LOOP (loop_vinfo);
4253
4254   /* Go through the data-refs, check that the analysis succeeded.  Update
4255      pointer from stmt_vec_info struct to DR and vectype.  */
4256
4257   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4258   FOR_EACH_VEC_ELT (datarefs, i, dr)
4259     {
           /* Set to GATHER or SCATTER when the reference should be tried as
              a gather load or scatter store.  */
4260       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4261       poly_uint64 vf;
4262
4263       gcc_assert (DR_REF (dr));
4264       stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
           /* The statement must not already have a data reference.  */
4265       gcc_assert (!stmt_info->dr_aux.dr);
4266       stmt_info->dr_aux.dr = dr;
4267       stmt_info->dr_aux.stmt = stmt_info;
4268
4269       /* Check that analysis of the data-ref succeeded.  */
4270       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4271           || !DR_STEP (dr))
4272         {
4273           bool maybe_gather
4274             = DR_IS_READ (dr)
4275               && !TREE_THIS_VOLATILE (DR_REF (dr))
4276               && (targetm.vectorize.builtin_gather != NULL
4277                   || supports_vec_gather_load_p ());
4278           bool maybe_scatter
4279             = DR_IS_WRITE (dr)
4280               && !TREE_THIS_VOLATILE (DR_REF (dr))
4281               && (targetm.vectorize.builtin_scatter != NULL
4282                   || supports_vec_scatter_store_p ());
4283
4284           /* If target supports vector gather loads or scatter stores,
4285              consider using them.  This is only done for loop
                  vectorization and for statements that are not nested in
                  an inner loop of the loop being vectorized.  */
4286           if (is_a <loop_vec_info> (vinfo)
4287               && !nested_in_vect_loop_p (loop, stmt_info))
4288             {
4289               if (maybe_gather || maybe_scatter)
4290                 {
4291                   if (maybe_gather)
4292                     gatherscatter = GATHER;
4293                   else
4294                     gatherscatter = SCATTER;
4295                 }
4296             }
4297
4298           if (gatherscatter == SG_NONE)
4299             {
4300               if (dump_enabled_p ())
4301                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4302                                  "not vectorized: data ref analysis "
4303                                  "failed %G", stmt_info->stmt);
4304               if (is_a <bb_vec_info> (vinfo))
4305                 {
4306                   /* In BB vectorization the ref can still participate
4307                      in dependence analysis, we just can't vectorize it.  */
4308                   STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4309                   continue;
4310                 }
4311               return opt_result::failure_at (stmt_info->stmt,
4312                                              "not vectorized:"
4313                                              " data ref analysis failed: %G",
4314                                              stmt_info->stmt);
4315             }
4316         }
4317
4318       /* See if this was detected as SIMD lane access, i.e. whether AUX
              holds one of the markers -1 to -4.  Negating the marker stores
              a value from 1 to 4.  */
4319       if (dr->aux == (void *)-1
4320           || dr->aux == (void *)-2
4321           || dr->aux == (void *)-3
4322           || dr->aux == (void *)-4)
4323         {
4324           if (nested_in_vect_loop_p (loop, stmt_info))
4325             return opt_result::failure_at (stmt_info->stmt,
4326                                            "not vectorized:"
4327                                            " data ref analysis failed: %G",
4328                                            stmt_info->stmt);
4329           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4330             = -(uintptr_t) dr->aux;
4331         }
4332
           /* Reject references whose base is a variable marked
              DECL_NONALIASED.  */
4333       tree base = get_base_address (DR_REF (dr));
4334       if (base && VAR_P (base) && DECL_NONALIASED (base))
4335         {
4336           if (dump_enabled_p ())
4337             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4338                              "not vectorized: base object not addressable "
4339                              "for stmt: %G", stmt_info->stmt);
4340           if (is_a <bb_vec_info> (vinfo))
4341             {
4342               /* In BB vectorization the ref can still participate
4343                  in dependence analysis, we just can't vectorize it.  */
4344               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4345               continue;
4346             }
4347           return opt_result::failure_at (stmt_info->stmt,
4348                                          "not vectorized: base object not"
4349                                          " addressable for stmt: %G",
4350                                          stmt_info->stmt);
4351         }
4352
           /* In loop vectorization a step that is not a compile-time
              constant makes this a strided access, which is not supported
              for statements nested in an inner loop.  */
4353       if (is_a <loop_vec_info> (vinfo)
4354           && DR_STEP (dr)
4355           && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4356         {
4357           if (nested_in_vect_loop_p (loop, stmt_info))
4358             return opt_result::failure_at (stmt_info->stmt,
4359                                            "not vectorized: "
4360                                            "not suitable for strided load %G",
4361                                            stmt_info->stmt);
4362           STMT_VINFO_STRIDED_P (stmt_info) = true;
4363         }
4364
4365       /* Update DR field in stmt_vec_info struct.  */
4366
4367       /* If the dataref is in an inner-loop of the loop that is considered for
4368          vectorization, we also want to analyze the access relative to
4369          the outer-loop (DR contains information only relative to the
4370          inner-most enclosing loop).  We do that by building a reference to the
4371          first location accessed by the inner-loop, and analyze it relative to
4372          the outer-loop.  */
4373       if (loop && nested_in_vect_loop_p (loop, stmt_info))
4374         {
4375           /* Build a reference to the first location accessed by the
4376              inner loop: *(BASE + INIT + OFFSET).  By construction,
4377              this address must be invariant in the inner loop, so we
4378              can consider it as being used in the outer loop.  */
4379           tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4380           tree offset = unshare_expr (DR_OFFSET (dr));
4381           tree init = unshare_expr (DR_INIT (dr));
4382           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4383                                           init, offset);
4384           tree init_addr = fold_build_pointer_plus (base, init_offset);
4385           tree init_ref = build_fold_indirect_ref (init_addr);
4386
4387           if (dump_enabled_p ())
4388             dump_printf_loc (MSG_NOTE, vect_location,
4389                              "analyze in outer loop: %T\n", init_ref);
4390
4391           opt_result res
4392             = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4393                                     init_ref, loop, stmt_info->stmt);
4394           if (!res)
4395             /* dr_analyze_innermost already explained the failure.  */
4396             return res;
4397
               /* NOTE(review): the "outer base alignment" line below ends in
                  two newlines, unlike its neighbours - confirm whether that
                  is intended.  */
4398           if (dump_enabled_p ())
4399             dump_printf_loc (MSG_NOTE, vect_location,
4400                              "\touter base_address: %T\n"
4401                              "\touter offset from base address: %T\n"
4402                              "\touter constant offset from base address: %T\n"
4403                              "\touter step: %T\n"
4404                              "\touter base alignment: %d\n\n"
4405                              "\touter base misalignment: %d\n"
4406                              "\touter offset alignment: %d\n"
4407                              "\touter step alignment: %d\n",
4408                              STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4409                              STMT_VINFO_DR_OFFSET (stmt_info),
4410                              STMT_VINFO_DR_INIT (stmt_info),
4411                              STMT_VINFO_DR_STEP (stmt_info),
4412                              STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4413                              STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4414                              STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4415                              STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4416         }
4417
4418       /* Set vectype for STMT.  */
4419       scalar_type = TREE_TYPE (DR_REF (dr));
4420       tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4421       if (!vectype)
4422         {
4423           if (dump_enabled_p ())
4424             {
4425               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4426                                "not vectorized: no vectype for stmt: %G",
4427                                stmt_info->stmt);
4428               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4429               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4430                                  scalar_type);
4431               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4432             }
4433
4434           if (is_a <bb_vec_info> (vinfo))
4435             {
4436               /* No vector type is fine, the ref can still participate
4437                  in dependence analysis, we just can't vectorize it.  */
4438               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4439               continue;
4440             }
4441           if (fatal)
4442             *fatal = false;
4443           return opt_result::failure_at (stmt_info->stmt,
4444                                          "not vectorized:"
4445                                          " no vectype for stmt: %G"
4446                                          " scalar_type: %T\n",
4447                                          stmt_info->stmt, scalar_type);
4448         }
4449       else
4450         {
4451           if (dump_enabled_p ())
4452             dump_printf_loc (MSG_NOTE, vect_location,
4453                              "got vectype for stmt: %G%T\n",
4454                              stmt_info->stmt, vectype);
4455         }
4456
4457       /* Adjust the minimal vectorization factor according to the
4458          vector type.  */
4459       vf = TYPE_VECTOR_SUBPARTS (vectype);
4460       *min_vf = upper_bound (*min_vf, vf);
4461
4462       /* Leave the BB vectorizer to pick the vector type later, based on
4463          the final dataref group size and SLP node size.  */
4464       if (is_a <loop_vec_info> (vinfo))
4465         STMT_VINFO_VECTYPE (stmt_info) = vectype;
4466
4467       if (gatherscatter != SG_NONE)
4468         {
               /* Besides passing vect_check_gather_scatter, the scalar type
                  of the chosen offset must itself have a vector type.  */
4469           gather_scatter_info gs_info;
4470           if (!vect_check_gather_scatter (stmt_info,
4471                                           as_a <loop_vec_info> (vinfo),
4472                                           &gs_info)
4473               || !get_vectype_for_scalar_type (vinfo,
4474                                                TREE_TYPE (gs_info.offset)))
4475             {
4476               if (fatal)
4477                 *fatal = false;
4478               return opt_result::failure_at
4479                         (stmt_info->stmt,
4480                          (gatherscatter == GATHER)
4481                          ? "not vectorized: not suitable for gather load %G"
4482                          : "not vectorized: not suitable for scatter store %G",
4483                          stmt_info->stmt);
4484             }
               /* The nonzero enum value (GATHER or SCATTER) is what gets
                  stored here.  */
4485           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4486         }
4487     }
4488
4489   /* We used to stop processing and prune the list here.  Verify we no
4490      longer need to.  */
4491   gcc_assert (i == datarefs.length ());
4492
4493   return opt_result::success ();
4494 }
4495
4496
4497 /* Function vect_get_new_vect_var.
4498
4499    Return a new variable of type TYPE made with create_tmp_reg.  VAR_KIND
4500    picks the name prefix ("vect", "stmp", "mask" or "vectp"); when NAME is
4501    nonnull create_tmp_reg is given "<prefix>_<NAME>", otherwise just
4502    "<prefix>".  */
4503
4504 tree
4505 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4506 {
4507   const char *kind_prefix;
4508
4509   switch (var_kind)
4510     {
4511     case vect_pointer_var:
4512       kind_prefix = "vectp";
4513       break;
4514     case vect_mask_var:
4515       kind_prefix = "mask";
4516       break;
4517     case vect_scalar_var:
4518       kind_prefix = "stmp";
4519       break;
4520     case vect_simple_var:
4521       kind_prefix = "vect";
4522       break;
4523     default:
4524       gcc_unreachable ();
4525     }
4526
4527   /* Without NAME the bare prefix is passed on.  */
4528   if (!name)
4529     return create_tmp_reg (type, kind_prefix);
4530
4531   /* Otherwise build "<prefix>_<NAME>" in a temporary string that is
4532      freed again once the variable has been created.  */
4533   char *full_name = concat (kind_prefix, "_", name, NULL);
4534   tree new_var = create_tmp_reg (type, full_name);
4535   free (full_name);
4536
4537   return new_var;
4538 }
4539
4540 /* Like vect_get_new_vect_var but return an SSA name.  */
4541
4542 tree
4543 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4544 {
4545   const char *prefix;
4546   tree new_vect_var;
4547
4548   switch (var_kind)
4549   {
4550   case vect_simple_var:
4551     prefix = "vect";
4552     break;
4553   case vect_scalar_var:
4554     prefix = "stmp";
4555     break;
4556   case vect_pointer_var:
4557     prefix = "vectp";
4558     break;
4559   default:
4560     gcc_unreachable ();
4561   }
4562
4563   if (name)
4564     {
4565       char* tmp = concat (prefix, "_", name, NULL);
4566       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4567       free (tmp);
4568     }
4569   else
4570     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4571
4572   return new_vect_var;
4573 }
4574
4575 /* Duplicate ptr info and set alignment/misaligment on NAME from DR_INFO.  */
4576
4577 static void
4578 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4579 {
4580   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4581   int misalign = DR_MISALIGNMENT (dr_info);
4582   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4583     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4584   else
4585     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4586                             known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4587                             misalign);
4588 }
4589
4590 /* Function vect_create_addr_base_for_vector_ref.
4591
4592    Create an expression that computes the address of the first memory location
4593    that will be accessed for a data reference.
4594
4595    Input:
4596    STMT_INFO: The statement containing the data reference.
4597    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4598    OFFSET: Optional. If supplied, it is be added to the initial address.
4599    LOOP:    Specify relative to which loop-nest should the address be computed.
4600             For example, when the dataref is in an inner-loop nested in an
4601             outer-loop that is now being vectorized, LOOP can be either the
4602             outer-loop, or the inner-loop.  The first memory location accessed
4603             by the following dataref ('in' points to short):
4604
4605                 for (i=0; i<N; i++)
4606                    for (j=0; j<M; j++)
4607                      s += in[i+j]
4608
4609             is as follows:
4610             if LOOP=i_loop:     &in             (relative to i_loop)
4611             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4612    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4613             initial address.  Unlike OFFSET, which is number of elements to
4614             be added, BYTE_OFFSET is measured in bytes.
4615
4616    Output:
4617    1. Return an SSA_NAME whose value is the address of the memory location of
4618       the first vector of the data reference.
4619    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4620       these statement(s) which define the returned SSA_NAME.
4621
4622    FORNOW: We are only handling array accesses with step 1.  */
4623
4624 tree
4625 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4626                                       gimple_seq *new_stmt_list,
4627                                       tree offset,
4628                                       tree byte_offset)
4629 {
4630   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4631   struct data_reference *dr = dr_info->dr;
4632   const char *base_name;
4633   tree addr_base;
4634   tree dest;
4635   gimple_seq seq = NULL;
4636   tree vect_ptr_type;
4637   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4638   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4639   innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4640
4641   tree data_ref_base = unshare_expr (drb->base_address);
4642   tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4643   tree init = unshare_expr (drb->init);
4644
4645   if (loop_vinfo)
4646     base_name = get_name (data_ref_base);
4647   else
4648     {
4649       base_offset = ssize_int (0);
4650       init = ssize_int (0);
4651       base_name = get_name (DR_REF (dr));
4652     }
4653
4654   /* Create base_offset */
4655   base_offset = size_binop (PLUS_EXPR,
4656                             fold_convert (sizetype, base_offset),
4657                             fold_convert (sizetype, init));
4658
4659   if (offset)
4660     {
4661       offset = fold_build2 (MULT_EXPR, sizetype,
4662                             fold_convert (sizetype, offset), step);
4663       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4664                                  base_offset, offset);
4665     }
4666   if (byte_offset)
4667     {
4668       byte_offset = fold_convert (sizetype, byte_offset);
4669       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4670                                  base_offset, byte_offset);
4671     }
4672
4673   /* base + base_offset */
4674   if (loop_vinfo)
4675     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4676   else
4677     {
4678       addr_base = build1 (ADDR_EXPR,
4679                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4680                           unshare_expr (DR_REF (dr)));
4681     }
4682
4683   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4684   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4685   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4686   gimple_seq_add_seq (new_stmt_list, seq);
4687
4688   if (DR_PTR_INFO (dr)
4689       && TREE_CODE (addr_base) == SSA_NAME
4690       && !SSA_NAME_PTR_INFO (addr_base))
4691     {
4692       vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4693       if (offset || byte_offset)
4694         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4695     }
4696
4697   if (dump_enabled_p ())
4698     dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4699
4700   return addr_base;
4701 }
4702
4703
4704 /* Function vect_create_data_ref_ptr.
4705
4706    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4707    location accessed in the loop by STMT_INFO, along with the def-use update
4708    chain to appropriately advance the pointer through the loop iterations.
4709    Also set aliasing information for the pointer.  This pointer is used by
4710    the callers to this function to create a memory reference expression for
4711    vector load/store access.
4712
4713    Input:
4714    1. STMT_INFO: a stmt that references memory. Expected to be of the form
4715          GIMPLE_ASSIGN <name, data-ref> or
4716          GIMPLE_ASSIGN <data-ref, name>.
4717    2. AGGR_TYPE: the type of the reference, which should be either a vector
4718         or an array.
4719    3. AT_LOOP: the loop where the vector memref is to be created.
4720    4. OFFSET (optional): an offset to be added to the initial address accessed
4721         by the data-ref in STMT_INFO.
4722    5. BSI: location where the new stmts are to be placed if there is no loop
4723    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4724         pointing to the initial address.
4725    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4726         to the initial address accessed by the data-ref in STMT_INFO.  This is
4727         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4728         in bytes.
4729    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4730         to the IV during each iteration of the loop.  NULL says to move
4731         by one copy of AGGR_TYPE up or down, depending on the step of the
4732         data reference.
4733
4734    Output:
4735    1. Declare a new ptr to vector_type, and have it point to the base of the
4736       data reference (initial addressed accessed by the data reference).
4737       For example, for vector of type V8HI, the following code is generated:
4738
4739       v8hi *ap;
4740       ap = (v8hi *)initial_address;
4741
4742       if OFFSET is not supplied:
4743          initial_address = &a[init];
4744       if OFFSET is supplied:
4745          initial_address = &a[init + OFFSET];
4746       if BYTE_OFFSET is supplied:
4747          initial_address = &a[init] + BYTE_OFFSET;
4748
4749       Return the initial_address in INITIAL_ADDRESS.
4750
4751    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4752       update the pointer in each iteration of the loop.
4753
4754       Return the increment stmt that updates the pointer in PTR_INCR.
4755
4756    3. Return the pointer.  */
4757
4758 tree
4759 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4760                           tree aggr_type, class loop *at_loop, tree offset,
4761                           tree *initial_address, gimple_stmt_iterator *gsi,
4762                           gimple **ptr_incr, bool only_init,
4763                           tree byte_offset, tree iv_step)
4764 {
4765   const char *base_name;
4766   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4767   class loop *loop = NULL;
4768   bool nested_in_vect_loop = false;
4769   class loop *containing_loop = NULL;
4770   tree aggr_ptr_type;
4771   tree aggr_ptr;
4772   tree new_temp;
4773   gimple_seq new_stmt_list = NULL;
4774   edge pe = NULL;
4775   basic_block new_bb;
4776   tree aggr_ptr_init;
4777   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4778   struct data_reference *dr = dr_info->dr;
4779   tree aptr;
4780   gimple_stmt_iterator incr_gsi;
4781   bool insert_after;
4782   tree indx_before_incr, indx_after_incr;
4783   gimple *incr;
4784   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4785
4786   gcc_assert (iv_step != NULL_TREE
4787               || TREE_CODE (aggr_type) == ARRAY_TYPE
4788               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4789
4790   if (loop_vinfo)
4791     {
4792       loop = LOOP_VINFO_LOOP (loop_vinfo);
4793       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4794       containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4795       pe = loop_preheader_edge (loop);
4796     }
4797   else
4798     {
4799       gcc_assert (bb_vinfo);
4800       only_init = true;
4801       *ptr_incr = NULL;
4802     }
4803
4804   /* Create an expression for the first address accessed by this load
4805      in LOOP.  */
4806   base_name = get_name (DR_BASE_ADDRESS (dr));
4807
4808   if (dump_enabled_p ())
4809     {
4810       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4811       dump_printf_loc (MSG_NOTE, vect_location,
4812                        "create %s-pointer variable to type: %T",
4813                        get_tree_code_name (TREE_CODE (aggr_type)),
4814                        aggr_type);
4815       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4816         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4817       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4818         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4819       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4820         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4821       else
4822         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4823       dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4824     }
4825
4826   /* (1) Create the new aggregate-pointer variable.
4827      Vector and array types inherit the alias set of their component
4828      type by default so we need to use a ref-all pointer if the data
4829      reference does not conflict with the created aggregated data
4830      reference because it is not addressable.  */
4831   bool need_ref_all = false;
4832   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4833                               get_alias_set (DR_REF (dr))))
4834     need_ref_all = true;
4835   /* Likewise for any of the data references in the stmt group.  */
4836   else if (DR_GROUP_SIZE (stmt_info) > 1)
4837     {
4838       stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4839       do
4840         {
4841           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4842           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4843                                       get_alias_set (DR_REF (sdr))))
4844             {
4845               need_ref_all = true;
4846               break;
4847             }
4848           sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4849         }
4850       while (sinfo);
4851     }
4852   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4853                                                need_ref_all);
4854   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4855
4856
4857   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4858      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4859      def-use update cycles for the pointer: one relative to the outer-loop
4860      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4861      to the inner-loop (which is the inner-most loop containing the dataref),
4862      and this is done be step (5) below.
4863
4864      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4865      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4866      redundant.  Steps (3),(4) create the following:
4867
4868         vp0 = &base_addr;
4869         LOOP:   vp1 = phi(vp0,vp2)
4870                 ...
4871                 ...
4872                 vp2 = vp1 + step
4873                 goto LOOP
4874
4875      If there is an inner-loop nested in loop, then step (5) will also be
4876      applied, and an additional update in the inner-loop will be created:
4877
4878         vp0 = &base_addr;
4879         LOOP:   vp1 = phi(vp0,vp2)
4880                 ...
4881         inner:     vp3 = phi(vp1,vp4)
4882                    vp4 = vp3 + inner_step
4883                    if () goto inner
4884                 ...
4885                 vp2 = vp1 + step
4886                 if () goto LOOP   */
4887
4888   /* (2) Calculate the initial address of the aggregate-pointer, and set
4889      the aggregate-pointer to point to it before the loop.  */
4890
4891   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4892
4893   new_temp = vect_create_addr_base_for_vector_ref (vinfo,
4894                                                    stmt_info, &new_stmt_list,
4895                                                    offset, byte_offset);
4896   if (new_stmt_list)
4897     {
4898       if (pe)
4899         {
4900           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4901           gcc_assert (!new_bb);
4902         }
4903       else
4904         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4905     }
4906
4907   *initial_address = new_temp;
4908   aggr_ptr_init = new_temp;
4909
4910   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4911      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4912      inner-loop nested in LOOP (during outer-loop vectorization).  */
4913
4914   /* No update in loop is required.  */
4915   if (only_init && (!loop_vinfo || at_loop == loop))
4916     aptr = aggr_ptr_init;
4917   else
4918     {
4919       /* Accesses to invariant addresses should be handled specially
4920          by the caller.  */
4921       tree step = vect_dr_behavior (vinfo, dr_info)->step;
4922       gcc_assert (!integer_zerop (step));
4923
4924       if (iv_step == NULL_TREE)
4925         {
4926           /* The step of the aggregate pointer is the type size,
4927              negated for downward accesses.  */
4928           iv_step = TYPE_SIZE_UNIT (aggr_type);
4929           if (tree_int_cst_sgn (step) == -1)
4930             iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4931         }
4932
4933       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4934
4935       create_iv (aggr_ptr_init,
4936                  fold_convert (aggr_ptr_type, iv_step),
4937                  aggr_ptr, loop, &incr_gsi, insert_after,
4938                  &indx_before_incr, &indx_after_incr);
4939       incr = gsi_stmt (incr_gsi);
4940
4941       /* Copy the points-to information if it exists. */
4942       if (DR_PTR_INFO (dr))
4943         {
4944           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4945           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4946         }
4947       if (ptr_incr)
4948         *ptr_incr = incr;
4949
4950       aptr = indx_before_incr;
4951     }
4952
4953   if (!nested_in_vect_loop || only_init)
4954     return aptr;
4955
4956
4957   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4958      nested in LOOP, if exists.  */
4959
4960   gcc_assert (nested_in_vect_loop);
4961   if (!only_init)
4962     {
4963       standard_iv_increment_position (containing_loop, &incr_gsi,
4964                                       &insert_after);
4965       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4966                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4967                  &indx_after_incr);
4968       incr = gsi_stmt (incr_gsi);
4969
4970       /* Copy the points-to information if it exists. */
4971       if (DR_PTR_INFO (dr))
4972         {
4973           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4974           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4975         }
4976       if (ptr_incr)
4977         *ptr_incr = incr;
4978
4979       return indx_before_incr;
4980     }
4981   else
4982     gcc_unreachable ();
4983 }
4984
4985
4986 /* Function bump_vector_ptr
4987
4988    Increment a pointer (to a vector type) by vector-size. If requested,
4989    i.e. if PTR-INCR is given, then also connect the new increment stmt
4990    to the existing def-use update-chain of the pointer, by modifying
4991    the PTR_INCR as illustrated below:
4992
4993    The pointer def-use update-chain before this function:
4994                         DATAREF_PTR = phi (p_0, p_2)
4995                         ....
4996         PTR_INCR:       p_2 = DATAREF_PTR + step
4997
4998    The pointer def-use update-chain after this function:
4999                         DATAREF_PTR = phi (p_0, p_2)
5000                         ....
5001                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5002                         ....
5003         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
5004
5005    Input:
5006    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5007                  in the loop.
5008    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5009               the loop.  The increment amount across iterations is expected
5010               to be vector_size.
5011    BSI - location where the new update stmt is to be placed.
5012    STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5013    BUMP - optional. The offset by which to bump the pointer. If not given,
5014           the offset is assumed to be vector_size.
5015
5016    Output: Return NEW_DATAREF_PTR as illustrated above.
5017
5018 */
5019
5020 tree
5021 bump_vector_ptr (vec_info *vinfo,
5022                  tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5023                  stmt_vec_info stmt_info, tree bump)
5024 {
5025   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5026   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5027   tree update = TYPE_SIZE_UNIT (vectype);
5028   gassign *incr_stmt;
5029   ssa_op_iter iter;
5030   use_operand_p use_p;
5031   tree new_dataref_ptr;
5032
5033   if (bump)
5034     update = bump;
5035
5036   if (TREE_CODE (dataref_ptr) == SSA_NAME)
5037     new_dataref_ptr = copy_ssa_name (dataref_ptr);
5038   else
5039     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5040   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5041                                    dataref_ptr, update);
5042   vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5043
5044   /* Copy the points-to information if it exists. */
5045   if (DR_PTR_INFO (dr))
5046     {
5047       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5048       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5049     }
5050
5051   if (!ptr_incr)
5052     return new_dataref_ptr;
5053
5054   /* Update the vector-pointer's cross-iteration increment.  */
5055   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5056     {
5057       tree use = USE_FROM_PTR (use_p);
5058
5059       if (use == dataref_ptr)
5060         SET_USE (use_p, new_dataref_ptr);
5061       else
5062         gcc_assert (operand_equal_p (use, update, 0));
5063     }
5064
5065   return new_dataref_ptr;
5066 }
5067
5068
5069 /* Copy memory reference info such as base/clique from the SRC reference
5070    to the DEST MEM_REF.  */
5071
5072 void
5073 vect_copy_ref_info (tree dest, tree src)
5074 {
5075   if (TREE_CODE (dest) != MEM_REF)
5076     return;
5077
5078   tree src_base = src;
5079   while (handled_component_p (src_base))
5080     src_base = TREE_OPERAND (src_base, 0);
5081   if (TREE_CODE (src_base) != MEM_REF
5082       && TREE_CODE (src_base) != TARGET_MEM_REF)
5083     return;
5084
5085   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5086   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5087 }
5088
5089
5090 /* Function vect_create_destination_var.
5091
5092    Create a new temporary of type VECTYPE.  */
5093
5094 tree
5095 vect_create_destination_var (tree scalar_dest, tree vectype)
5096 {
5097   tree vec_dest;
5098   const char *name;
5099   char *new_name;
5100   tree type;
5101   enum vect_var_kind kind;
5102
5103   kind = vectype
5104     ? VECTOR_BOOLEAN_TYPE_P (vectype)
5105     ? vect_mask_var
5106     : vect_simple_var
5107     : vect_scalar_var;
5108   type = vectype ? vectype : TREE_TYPE (scalar_dest);
5109
5110   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5111
5112   name = get_name (scalar_dest);
5113   if (name)
5114     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5115   else
5116     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5117   vec_dest = vect_get_new_vect_var (type, kind, new_name);
5118   free (new_name);
5119
5120   return vec_dest;
5121 }
5122
5123 /* Function vect_grouped_store_supported.
5124
5125    Returns TRUE if interleave high and interleave low permutations
5126    are supported, and FALSE otherwise.  */
5127
5128 bool
5129 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5130 {
5131   machine_mode mode = TYPE_MODE (vectype);
5132
5133   /* vect_permute_store_chain requires the group size to be equal to 3 or
5134      be a power of two.  */
5135   if (count != 3 && exact_log2 (count) == -1)
5136     {
5137       if (dump_enabled_p ())
5138         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5139                          "the size of the group of accesses"
5140                          " is not a power of 2 or not eqaul to 3\n");
5141       return false;
5142     }
5143
5144   /* Check that the permutation is supported.  */
5145   if (VECTOR_MODE_P (mode))
5146     {
5147       unsigned int i;
5148       if (count == 3)
5149         {
5150           unsigned int j0 = 0, j1 = 0, j2 = 0;
5151           unsigned int i, j;
5152
5153           unsigned int nelt;
5154           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5155             {
5156               if (dump_enabled_p ())
5157                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5158                                  "cannot handle groups of 3 stores for"
5159                                  " variable-length vectors\n");
5160               return false;
5161             }
5162
5163           vec_perm_builder sel (nelt, nelt, 1);
5164           sel.quick_grow (nelt);
5165           vec_perm_indices indices;
5166           for (j = 0; j < 3; j++)
5167             {
5168               int nelt0 = ((3 - j) * nelt) % 3;
5169               int nelt1 = ((3 - j) * nelt + 1) % 3;
5170               int nelt2 = ((3 - j) * nelt + 2) % 3;
5171               for (i = 0; i < nelt; i++)
5172                 {
5173                   if (3 * i + nelt0 < nelt)
5174                     sel[3 * i + nelt0] = j0++;
5175                   if (3 * i + nelt1 < nelt)
5176                     sel[3 * i + nelt1] = nelt + j1++;
5177                   if (3 * i + nelt2 < nelt)
5178                     sel[3 * i + nelt2] = 0;
5179                 }
5180               indices.new_vector (sel, 2, nelt);
5181               if (!can_vec_perm_const_p (mode, indices))
5182                 {
5183                   if (dump_enabled_p ())
5184                     dump_printf (MSG_MISSED_OPTIMIZATION,
5185                                  "permutation op not supported by target.\n");
5186                   return false;
5187                 }
5188
5189               for (i = 0; i < nelt; i++)
5190                 {
5191                   if (3 * i + nelt0 < nelt)
5192                     sel[3 * i + nelt0] = 3 * i + nelt0;
5193                   if (3 * i + nelt1 < nelt)
5194                     sel[3 * i + nelt1] = 3 * i + nelt1;
5195                   if (3 * i + nelt2 < nelt)
5196                     sel[3 * i + nelt2] = nelt + j2++;
5197                 }
5198               indices.new_vector (sel, 2, nelt);
5199               if (!can_vec_perm_const_p (mode, indices))
5200                 {
5201                   if (dump_enabled_p ())
5202                     dump_printf (MSG_MISSED_OPTIMIZATION,
5203                                  "permutation op not supported by target.\n");
5204                   return false;
5205                 }
5206             }
5207           return true;
5208         }
5209       else
5210         {
5211           /* If length is not equal to 3 then only power of 2 is supported.  */
5212           gcc_assert (pow2p_hwi (count));
5213           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5214
5215           /* The encoding has 2 interleaved stepped patterns.  */
5216           vec_perm_builder sel (nelt, 2, 3);
5217           sel.quick_grow (6);
5218           for (i = 0; i < 3; i++)
5219             {
5220               sel[i * 2] = i;
5221               sel[i * 2 + 1] = i + nelt;
5222             }
5223           vec_perm_indices indices (sel, 2, nelt);
5224           if (can_vec_perm_const_p (mode, indices))
5225             {
5226               for (i = 0; i < 6; i++)
5227                 sel[i] += exact_div (nelt, 2);
5228               indices.new_vector (sel, 2, nelt);
5229               if (can_vec_perm_const_p (mode, indices))
5230                 return true;
5231             }
5232         }
5233     }
5234
5235   if (dump_enabled_p ())
5236     dump_printf (MSG_MISSED_OPTIMIZATION,
5237                  "permutation op not supported by target.\n");
5238   return false;
5239 }
5240
5241
5242 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5243    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5244
5245 bool
5246 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5247                             bool masked_p)
5248 {
5249   if (masked_p)
5250     return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5251                                          vec_mask_store_lanes_optab,
5252                                          vectype, count);
5253   else
5254     return vect_lanes_optab_supported_p ("vec_store_lanes",
5255                                          vec_store_lanes_optab,
5256                                          vectype, count);
5257 }
5258
5259
5260 /* Function vect_permute_store_chain.
5261
5262    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5263    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5264    the data correctly for the stores.  Return the final references for stores
5265    in RESULT_CHAIN.
5266
5267    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5268    The input is 4 vectors each containing 8 elements.  We assign a number to
5269    each element, the input sequence is:
5270
5271    1st vec:   0  1  2  3  4  5  6  7
5272    2nd vec:   8  9 10 11 12 13 14 15
5273    3rd vec:  16 17 18 19 20 21 22 23
5274    4th vec:  24 25 26 27 28 29 30 31
5275
5276    The output sequence should be:
5277
5278    1st vec:  0  8 16 24  1  9 17 25
5279    2nd vec:  2 10 18 26  3 11 19 27
5280    3rd vec:  4 12 20 28  5 13 21 29
5281    4th vec:  6 14 22 30  7 15 23 31
5282
5283    i.e., we interleave the contents of the four vectors in their order.
5284
5285    We use interleave_high/low instructions to create such output.  The input of
5286    each interleave_high/low operation is two vectors:
5287    1st vec    2nd vec
5288    0 1 2 3    4 5 6 7
5289    the even elements of the result vector are obtained left-to-right from the
5290    high/low elements of the first vector.  The odd elements of the result are
5291    obtained left-to-right from the high/low elements of the second vector.
5292    The output of interleave_high will be:   0 4 1 5
5293    and of interleave_low:                   2 6 3 7
5294
5295
5296    The permutation is done in log LENGTH stages.  In each stage interleave_high
5297    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5298    where the first argument is taken from the first half of DR_CHAIN and the
5299    second argument from its second half.
5300    In our example,
5301
5302    I1: interleave_high (1st vec, 3rd vec)
5303    I2: interleave_low (1st vec, 3rd vec)
5304    I3: interleave_high (2nd vec, 4th vec)
5305    I4: interleave_low (2nd vec, 4th vec)
5306
5307    The output for the first stage is:
5308
5309    I1:  0 16  1 17  2 18  3 19
5310    I2:  4 20  5 21  6 22  7 23
5311    I3:  8 24  9 25 10 26 11 27
5312    I4: 12 28 13 29 14 30 15 31
5313
5314    The output of the second stage, i.e. the final result is:
5315
5316    I1:  0  8 16 24  1  9 17 25
5317    I2:  2 10 18 26  3 11 19 27
5318    I3:  4 12 20 28  5 13 21 29
5319    I4:  6 14 22 30  7 15 23 31.  */
5320
5321 void
5322 vect_permute_store_chain (vec_info *vinfo, vec<tree> dr_chain,
5323                           unsigned int length,
5324                           stmt_vec_info stmt_info,
5325                           gimple_stmt_iterator *gsi,
5326                           vec<tree> *result_chain)
5327 {
       /* Emit the VEC_PERM_EXPR statements that interleave the LENGTH vectors
          in DR_CHAIN and leave the permuted vectors in RESULT_CHAIN.  LENGTH
          must be 3 or a power of two.  VINFO, STMT_INFO and GSI are only
          passed on to vect_finish_stmt_generation for each new statement.  */
5328   tree vect1, vect2, high, low;
5329   gimple *perm_stmt;
5330   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5331   tree perm_mask_low, perm_mask_high;
5332   tree data_ref;
5333   tree perm3_mask_low, perm3_mask_high;
       /* LOG_LENGTH is only read by the power-of-two branch below.  */
5334   unsigned int i, j, n, log_length = exact_log2 (length);
5335
       /* Start RESULT_CHAIN as a LENGTH-element copy of DR_CHAIN; every slot
          is overwritten by the permuted vectors below.  */
5336   result_chain->quick_grow (length);
5337   memcpy (result_chain->address (), dr_chain.address (),
5338           length * sizeof (tree));
5339
5340   if (length == 3)
5341     {
5342       /* vect_grouped_store_supported ensures that this is constant.  */
5343       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
           /* Next element to take from dr_chain[0], dr_chain[1] and
              dr_chain[2] respectively.  They are never reset inside the loop
              over J, so each output vector continues where the previous one
              stopped.  */
5344       unsigned int j0 = 0, j1 = 0, j2 = 0;
5345
5346       vec_perm_builder sel (nelt, nelt, 1);
5347       sel.quick_grow (nelt);
5348       vec_perm_indices indices;
           /* One iteration per output vector; each needs two permutes because
              a VEC_PERM_EXPR combines only two input vectors.  */
5349       for (j = 0; j < 3; j++)
5350         {
               /* Offsets (modulo 3) of the lanes of output vector J that
                  receive elements of dr_chain[0], dr_chain[1] and
                  dr_chain[2].  */
5351           int nelt0 = ((3 - j) * nelt) % 3;
5352           int nelt1 = ((3 - j) * nelt + 1) % 3;
5353           int nelt2 = ((3 - j) * nelt + 2) % 3;
5354
               /* Low mask: place the elements of the first two vectors; the
                  lanes reserved for the third vector get index 0 as a
                  placeholder and are replaced by the high mask below.  */
5355           for (i = 0; i < nelt; i++)
5356             {
5357               if (3 * i + nelt0 < nelt)
5358                 sel[3 * i + nelt0] = j0++;
5359               if (3 * i + nelt1 < nelt)
5360                 sel[3 * i + nelt1] = nelt + j1++;
5361               if (3 * i + nelt2 < nelt)
5362                 sel[3 * i + nelt2] = 0;
5363             }
5364           indices.new_vector (sel, 2, nelt);
5365           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5366
               /* High mask: keep the lanes already filled by the low permute
                  (identity indices into the first operand) and take the
                  remaining lanes from the second operand, dr_chain[2].  */
5367           for (i = 0; i < nelt; i++)
5368             {
5369               if (3 * i + nelt0 < nelt)
5370                 sel[3 * i + nelt0] = 3 * i + nelt0;
5371               if (3 * i + nelt1 < nelt)
5372                 sel[3 * i + nelt1] = 3 * i + nelt1;
5373               if (3 * i + nelt2 < nelt)
5374                 sel[3 * i + nelt2] = nelt + j2++;
5375             }
5376           indices.new_vector (sel, 2, nelt);
5377           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5378
5379           vect1 = dr_chain[0];
5380           vect2 = dr_chain[1];
5381
5382           /* Create interleaving stmt:
5383              low = VEC_PERM_EXPR <vect1, vect2,
5384                                   {j, nelt, *, j + 1, nelt + j + 1, *,
5385                                    j + 2, nelt + j + 2, *, ...}>  */
5386           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5387           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5388                                            vect2, perm3_mask_low);
5389           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5390
               /* Feed the partial result into the second permute together
                  with the third input vector.  */
5391           vect1 = data_ref;
5392           vect2 = dr_chain[2];
5393           /* Create interleaving stmt:
5394              high = VEC_PERM_EXPR <vect1, vect2,
5395                                    {0, 1, nelt + j, 3, 4, nelt + j + 1,
5396                                     6, 7, nelt + j + 2, ...}>  */
5397           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5398           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5399                                            vect2, perm3_mask_high);
5400           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5401           (*result_chain)[j] = data_ref;
5402         }
5403     }
5404   else
5405     {
5406       /* If length is not equal to 3 then only power of 2 is supported.  */
5407       gcc_assert (pow2p_hwi (length));
5408
5409       /* The encoding has 2 interleaved stepped patterns.  */
5410       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5411       vec_perm_builder sel (nelt, 2, 3);
5412       sel.quick_grow (6);
           /* Leading elements of the interleave-high selector:
              {0, nelt, 1, nelt + 1, 2, nelt + 2}.  */
5413       for (i = 0; i < 3; i++)
5414         {
5415           sel[i * 2] = i;
5416           sel[i * 2 + 1] = i + nelt;
5417         }
5418         vec_perm_indices indices (sel, 2, nelt);
5419         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5420
             /* The interleave-low selector is the high selector with every
                index moved forward by half a vector.  */
5421         for (i = 0; i < 6; i++)
5422           sel[i] += exact_div (nelt, 2);
5423         indices.new_vector (sel, 2, nelt);
5424         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5425
             /* log2 (LENGTH) stages; each stage pairs vector J of the first
                half of DR_CHAIN with vector J of the second half and stores
                the high/low results in adjacent RESULT_CHAIN slots.  */
5426         for (i = 0, n = log_length; i < n; i++)
5427           {
5428             for (j = 0; j < length/2; j++)
5429               {
5430                 vect1 = dr_chain[j];
5431                 vect2 = dr_chain[j+length/2];
5432
5433                 /* Create interleaving stmt:
5434                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5435                                                         ...}>  */
5436                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5437                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5438                                                  vect2, perm_mask_high);
5439                 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5440                 (*result_chain)[2*j] = high;
5441
5442                 /* Create interleaving stmt:
5443                    low = VEC_PERM_EXPR <vect1, vect2,
5444                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5445                                          ...}>  */
5446                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5447                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5448                                                  vect2, perm_mask_low);
5449                 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5450                 (*result_chain)[2*j+1] = low;
5451               }
                 /* The output of this stage is the input of the next one.
                    NOTE(review): DR_CHAIN is a by-value parameter but this
                    writes through its address (); presumably the caller's
                    vector shares that storage and is overwritten too -
                    confirm.  */
5452             memcpy (dr_chain.address (), result_chain->address (),
5453                     length * sizeof (tree));
5454           }
5455     }
5456 }
5457
5458 /* Function vect_setup_realignment
5459
5460    This function is called when vectorizing an unaligned load using
5461    the dr_explicit_realign[_optimized] scheme.
5462    This function generates the following code at the loop prolog:
5463
5464       p = initial_addr;
5465    x  msq_init = *(floor(p));   # prolog load
5466       realignment_token = call target_builtin;
5467     loop:
5468    x  msq = phi (msq_init, ---)
5469
5470    The stmts marked with x are generated only for the case of
5471    dr_explicit_realign_optimized.
5472
5473    The code above sets up a new (vector) pointer, pointing to the first
5474    location accessed by STMT_INFO, and a "floor-aligned" load using that
5475    pointer.  It also generates code to compute the "realignment-token"
5476    (if the relevant target hook was defined), and creates a phi-node at the
5477    loop-header bb whose arguments are the result of the prolog-load (created
5478    by this function) and the result of a load that takes place in the loop
5479    (to be created by the caller to this function).
5480
5481    For the case of dr_explicit_realign_optimized:
5482    The caller to this function uses the phi-result (msq) to create the
5483    realignment code inside the loop, and sets up the missing phi argument,
5484    as follows:
5485     loop:
5486       msq = phi (msq_init, lsq)
5487       lsq = *(floor(p'));        # load in loop
5488       result = realign_load (msq, lsq, realignment_token);
5489
5490    For the case of dr_explicit_realign:
5491     loop:
5492       msq = *(floor(p));        # load in loop
5493       p' = p + (VS-1);
5494       lsq = *(floor(p'));       # load in loop
5495       result = realign_load (msq, lsq, realignment_token);
5496
5497    Input:
5498    STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5499                a memory location that may be unaligned.
5500    GSI - place where new code is to be inserted when it is not placed on
                a loop preheader edge.
5501    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5502                               is used.
        INIT_ADDR - if not NULL_TREE, the address of the access as already
                    computed by the caller inside the loop; this selects the
                    compute-in-loop mode described in step 1 below.
5503
5504    Output:
5505    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5506                        target hook, if defined.
        AT_LOOP - if not NULL, set to the loop at whose preheader the initial
                  vector load is placed (NULL when VINFO is not a
                  loop_vec_info).
5507    Return value - the result of the loop-header phi node for
                       dr_explicit_realign_optimized; NULL_TREE for
                       dr_explicit_realign.  */
5508
5509 tree
5510 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5511                         gimple_stmt_iterator *gsi, tree *realignment_token,
5512                         enum dr_alignment_support alignment_support_scheme,
5513                         tree init_addr,
5514                         class loop **at_loop)
5515 {
5516   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
       /* NULL when VINFO is not a loop_vec_info; LOOP then stays NULL too.  */
5517   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5518   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5519   struct data_reference *dr = dr_info->dr;
5520   class loop *loop = NULL;
5521   edge pe = NULL;
5522   tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5523   tree vec_dest;
5524   gimple *inc;
5525   tree ptr;
5526   tree data_ref;
5527   basic_block new_bb;
5528   tree msq_init = NULL_TREE;
5529   tree new_temp;
5530   gphi *phi_stmt;
       /* Stays NULL_TREE unless the phi node is created in step 5.  */
5531   tree msq = NULL_TREE;
5532   gimple_seq stmts = NULL;
5533   bool compute_in_loop = false;
5534   bool nested_in_vect_loop = false;
       /* The loop that directly contains the scalar load; the phi node of
          step 5 is created in its header.  */
5535   class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5536   class loop *loop_for_initial_load = NULL;
5537
5538   if (loop_vinfo)
5539     {
5540       loop = LOOP_VINFO_LOOP (loop_vinfo);
5541       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5542     }
5543
5544   gcc_assert (alignment_support_scheme == dr_explicit_realign
5545               || alignment_support_scheme == dr_explicit_realign_optimized);
5546
5547   /* We need to generate three things:
5548      1. the misalignment computation
5549      2. the extra vector load (for the optimized realignment scheme).
5550      3. the phi node for the two vectors from which the realignment is
5551       done (for the optimized realignment scheme).  */
5552
5553   /* 1. Determine where to generate the misalignment computation.
5554
5555      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5556      calculation will be generated by this function, outside the loop (in the
5557      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5558      caller, inside the loop.
5559
5560      Background: If the misalignment remains fixed throughout the iterations of
5561      the loop, then both realignment schemes are applicable, and also the
5562      misalignment computation can be done outside LOOP.  This is because we are
5563      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5564      are a multiple of VS (the Vector Size), and therefore the misalignment in
5565      different vectorized LOOP iterations is always the same.
5566      The problem arises only if the memory access is in an inner-loop nested
5567      inside LOOP, which is now being vectorized using outer-loop vectorization.
5568      This is the only case when the misalignment of the memory access may not
5569      remain fixed throughout the iterations of the inner-loop (as explained in
5570      detail in vect_supportable_dr_alignment).  In this case, not only is the
5571      optimized realignment scheme not applicable, but also the misalignment
5572      computation (and generation of the realignment token that is passed to
5573      REALIGN_LOAD) have to be done inside the loop.
5574
5575      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5576      or not, which in turn determines if the misalignment is computed inside
5577      the inner-loop, or outside LOOP.  */
5578
       /* Without a loop_vec_info there is no preheader to hoist to, so that
          case is treated as compute-in-loop as well.  */
5579   if (init_addr != NULL_TREE || !loop_vinfo)
5580     {
5581       compute_in_loop = true;
5582       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5583     }
5584
5585
5586   /* 2. Determine where to generate the extra vector load.
5587
5588      For the optimized realignment scheme, instead of generating two vector
5589      loads in each iteration, we generate a single extra vector load in the
5590      preheader of the loop, and in each iteration reuse the result of the
5591      vector load from the previous iteration.  In case the memory access is in
5592      an inner-loop nested inside LOOP, which is now being vectorized using
5593      outer-loop vectorization, we need to determine whether this initial vector
5594      load should be generated at the preheader of the inner-loop, or can be
5595      generated at the preheader of LOOP.  If the memory access has no evolution
5596      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5597      to be generated inside LOOP (in the preheader of the inner-loop).  */
5598
5599   if (nested_in_vect_loop)
5600     {
           /* A zero step means the access does not advance with the outer
              loop.  */
5601       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5602       bool invariant_in_outerloop =
5603             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5604       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5605     }
5606   else
5607     loop_for_initial_load = loop;
5608   if (at_loop)
5609     *at_loop = loop_for_initial_load;
5610
       /* PE stays NULL when there is no loop at all.  */
5611   if (loop_for_initial_load)
5612     pe = loop_preheader_edge (loop_for_initial_load);
5613
5614   /* 3. For the case of the optimized realignment, create the first vector
5615       load at the loop preheader.  */
5616
5617   if (alignment_support_scheme == dr_explicit_realign_optimized)
5618     {
5619       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5620       gassign *new_stmt;
5621
5622       gcc_assert (!compute_in_loop);
5623       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5624       ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5625                                       loop_for_initial_load, NULL_TREE,
5626                                       &init_addr, NULL, &inc, true);
5627       if (TREE_CODE (ptr) == SSA_NAME)
5628         new_temp = copy_ssa_name (ptr);
5629       else
5630         new_temp = make_ssa_name (TREE_TYPE (ptr));
           /* new_temp = ptr & (0 - align): the floor (p) of the comments
              above, i.e. PTR with its low bits cleared (assuming ALIGN is a
              power of two).  */
5631       poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5632       tree type = TREE_TYPE (ptr);
5633       new_stmt = gimple_build_assign
5634                    (new_temp, BIT_AND_EXPR, ptr,
5635                     fold_build2 (MINUS_EXPR, type,
5636                                  build_int_cst (type, 0),
5637                                  build_int_cst (type, align)));
           /* The assert checks that inserting on the preheader edge did not
              have to create a new basic block.  */
5638       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5639       gcc_assert (!new_bb);
           /* Load a whole vector from the aligned address, reusing the alias
              pointer type and reference info of the original access.  */
5640       data_ref
5641         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5642                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5643       vect_copy_ref_info (data_ref, DR_REF (dr));
5644       new_stmt = gimple_build_assign (vec_dest, data_ref);
5645       new_temp = make_ssa_name (vec_dest, new_stmt);
5646       gimple_assign_set_lhs (new_stmt, new_temp);
           /* NOTE(review): PE was already used unconditionally for the
              address computation just above, so the else arm below looks
              unreachable on this path - confirm.  */
5647       if (pe)
5648         {
5649           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5650           gcc_assert (!new_bb);
5651         }
5652       else
5653          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5654
5655       msq_init = gimple_assign_lhs (new_stmt);
5656     }
5657
5658   /* 4. Create realignment token using a target builtin, if available.
5659       It is done either inside the containing loop, or before LOOP (as
5660       determined above).  */
5661
5662   if (targetm.vectorize.builtin_mask_for_load)
5663     {
5664       gcall *new_stmt;
5665       tree builtin_decl;
5666
5667       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5668       if (!init_addr)
5669         {
5670           /* Generate the INIT_ADDR computation outside LOOP.  */
5671           init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5672                                                             stmt_info, &stmts,
5673                                                             NULL_TREE);
5674           if (loop)
5675             {
5676               pe = loop_preheader_edge (loop);
5677               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5678               gcc_assert (!new_bb);
5679             }
5680           else
5681              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5682         }
5683
           /* Build realignment_token = builtin (init_addr); the result
              variable takes the builtin's return type.  */
5684       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5685       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5686       vec_dest =
5687         vect_create_destination_var (scalar_dest,
5688                                      gimple_call_return_type (new_stmt));
5689       new_temp = make_ssa_name (vec_dest, new_stmt);
5690       gimple_call_set_lhs (new_stmt, new_temp);
5691
5692       if (compute_in_loop)
5693         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5694       else
5695         {
5696           /* Generate the misalignment computation outside LOOP.  */
               /* LOOP is non-NULL here: COMPUTE_IN_LOOP is false only when
                  LOOP_VINFO, and hence LOOP, was set above.  */
5697           pe = loop_preheader_edge (loop);
5698           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5699           gcc_assert (!new_bb);
5700         }
5701
5702       *realignment_token = gimple_call_lhs (new_stmt);
5703
5704       /* The result of the CALL_EXPR to this builtin is determined from
5705          the value of the parameter and no global variables are touched
5706          which makes the builtin a "const" function.  Requiring the
5707          builtin to have the "const" attribute makes it unnecessary
5708          to call mark_call_clobbered.  */
5709       gcc_assert (TREE_READONLY (builtin_decl));
5710     }
5711
       /* The non-optimized scheme needs no phi node; MSQ is still NULL_TREE
          at this point.  */
5712   if (alignment_support_scheme == dr_explicit_realign)
5713     return msq;
5714
5715   gcc_assert (!compute_in_loop);
5716   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5717
5718
5719   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5720
       /* Only the preheader argument (MSQ_INIT) is added here; the header
          comment leaves the other phi argument to the caller.  */
5721   pe = loop_preheader_edge (containing_loop);
5722   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5723   msq = make_ssa_name (vec_dest);
5724   phi_stmt = create_phi_node (msq, containing_loop->header);
5725   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5726
5727   return msq;
5728 }
5729
5730
5731 /* Function vect_grouped_load_supported.
5732
5733    COUNT is the size of the load group (the number of statements plus the
5734    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5735    only one statement, with a gap of COUNT - 1.  VECTYPE is the vector type
        whose mode the permutes are checked against.
5736
5737    Returns true if a suitable permute exists, i.e. if the target supports
        the constant permutes that vect_permute_load_chain builds for a group
        of COUNT vectors.  When dumping is enabled, the reason for each
        rejection is written to the dump file.  */
5738
5739 bool
5740 vect_grouped_load_supported (tree vectype, bool single_element_p,
5741                              unsigned HOST_WIDE_INT count)
5742 {
5743   machine_mode mode = TYPE_MODE (vectype);
5744
5745   /* If this is single-element interleaving with an element distance
5746      that leaves unused vector loads around punt - we at least create
5747      very sub-optimal code in that case (and blow up memory,
5748      see PR65518).  */
5749   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5750     {
5751       if (dump_enabled_p ())
5752         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5753                          "single-element interleaving not supported "
5754                          "for not adjacent vector loads\n");
5755       return false;
5756     }
5757
5758   /* vect_permute_load_chain requires the group size to be equal to 3 or
5759      be a power of two.  */
5760   if (count != 3 && exact_log2 (count) == -1)
5761     {
5762       if (dump_enabled_p ())
5763         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5764                          "the size of the group of accesses"
5765                          " is not a power of 2 or not equal to 3\n");
5766       return false;
5767     }
5768
5769   /* Check that the permutation is supported.  */
5770   if (VECTOR_MODE_P (mode))
5771     {
5772       unsigned int i, j;
5773       if (count == 3)
5774         {
               /* The three-vector selectors are built element by element, so
                  the number of elements must be a compile-time constant.  */
5775           unsigned int nelt;
5776           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5777             {
5778               if (dump_enabled_p ())
5779                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5780                                  "cannot handle groups of 3 loads for"
5781                                  " variable-length vectors\n");
5782               return false;
5783             }
5784
5785           vec_perm_builder sel (nelt, nelt, 1);
5786           sel.quick_grow (nelt);
5787           vec_perm_indices indices;
5788           unsigned int k;
               /* For each of the three output vectors, check the same two
                  selectors that vect_permute_load_chain builds for it.  */
5789           for (k = 0; k < 3; k++)
5790             {
                   /* First selector: every third element, starting at K, of
                      the first two vectors; lanes that would need the third
                      vector get index 0 as a placeholder.  */
5791               for (i = 0; i < nelt; i++)
5792                 if (3 * i + k < 2 * nelt)
5793                   sel[i] = 3 * i + k;
5794                 else
5795                   sel[i] = 0;
5796               indices.new_vector (sel, 2, nelt);
5797               if (!can_vec_perm_const_p (mode, indices))
5798                 {
5799                   if (dump_enabled_p ())
5800                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5801                                      "shuffle of 3 loads is not supported by"
5802                                      " target\n");
5803                   return false;
5804                 }
                   /* Second selector: keep the lanes filled by the first
                      permute and take the remaining ones from the second
                      operand, every third element starting at
                      (nelt + k) % 3.  */
5805               for (i = 0, j = 0; i < nelt; i++)
5806                 if (3 * i + k < 2 * nelt)
5807                   sel[i] = i;
5808                 else
5809                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5810               indices.new_vector (sel, 2, nelt);
5811               if (!can_vec_perm_const_p (mode, indices))
5812                 {
5813                   if (dump_enabled_p ())
5814                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5815                                      "shuffle of 3 loads is not supported by"
5816                                      " target\n");
5817                   return false;
5818                 }
5819             }
5820           return true;
5821         }
5822       else
5823         {
5824           /* If COUNT is not equal to 3 then only power of 2 is supported.  */
5825           gcc_assert (pow2p_hwi (count));
5826           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5827
5828           /* The encoding has a single stepped pattern.  */
5829           vec_perm_builder sel (nelt, 1, 3);
5830           sel.quick_grow (3);
               /* Extract-even selector: {0, 2, 4, ...}.  */
5831           for (i = 0; i < 3; i++)
5832             sel[i] = i * 2;
5833           vec_perm_indices indices (sel, 2, nelt);
5834           if (can_vec_perm_const_p (mode, indices))
5835             {
                   /* Extract-odd selector: {1, 3, 5, ...}.  */
5836               for (i = 0; i < 3; i++)
5837                 sel[i] = i * 2 + 1;
5838               indices.new_vector (sel, 2, nelt);
5839               if (can_vec_perm_const_p (mode, indices))
5840                 return true;
5841             }
5842         }
5843     }
5844
       /* Reached when MODE is not a vector mode or when either the even or
          the odd extraction is unsupported.  */
5845   if (dump_enabled_p ())
5846     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5847                      "extract even/odd not supported by target\n");
5848   return false;
5849 }
5850
5851 /* Tell whether a load-lanes optab is available for COUNT vectors of type
5852    VECTYPE.  MASKED_P selects vec_mask_load_lanes over vec_load_lanes.  */
5853
5854 bool
5855 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5856                            bool masked_p)
5857 {
5858   if (!masked_p)
5859     return vect_lanes_optab_supported_p ("vec_load_lanes",
5860                                          vec_load_lanes_optab,
5861                                          vectype, count);
5862
5863   return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5864                                        vec_mask_load_lanes_optab,
5865                                        vectype, count);
5866 }
5867
5868 /* Function vect_permute_load_chain.
5869
5870    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5871    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5872    the input data correctly.  Return the final references for loads in
5873    RESULT_CHAIN.
5874
5875    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5876    The input is 4 vectors each containing 8 elements. We assign a number to each
5877    element, the input sequence is:
5878
5879    1st vec:   0  1  2  3  4  5  6  7
5880    2nd vec:   8  9 10 11 12 13 14 15
5881    3rd vec:  16 17 18 19 20 21 22 23
5882    4th vec:  24 25 26 27 28 29 30 31
5883
5884    The output sequence should be:
5885
5886    1st vec:  0 4  8 12 16 20 24 28
5887    2nd vec:  1 5  9 13 17 21 25 29
5888    3rd vec:  2 6 10 14 18 22 26 30
5889    4th vec:  3 7 11 15 19 23 27 31
5890
5891    i.e., the first output vector should contain the first elements of each
5892    interleaving group, etc.
5893
5894    We use extract_even/odd instructions to create such output.  The input of
5895    each extract_even/odd operation is two vectors
5896    1st vec    2nd vec
5897    0 1 2 3    4 5 6 7
5898
5899    and the output is the vector of extracted even/odd elements.  The output of
5900    extract_even will be:   0 2 4 6
5901    and of extract_odd:     1 3 5 7
5902
5903
5904    The permutation is done in log LENGTH stages.  In each stage extract_even
5905    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5906    their order.  In our example,
5907
5908    E1: extract_even (1st vec, 2nd vec)
5909    E2: extract_odd (1st vec, 2nd vec)
5910    E3: extract_even (3rd vec, 4th vec)
5911    E4: extract_odd (3rd vec, 4th vec)
5912
5913    The output for the first stage will be:
5914
5915    E1:  0  2  4  6  8 10 12 14
5916    E2:  1  3  5  7  9 11 13 15
5917    E3: 16 18 20 22 24 26 28 30
5918    E4: 17 19 21 23 25 27 29 31
5919
5920    In order to proceed and create the correct sequence for the next stage (or
5921    for the correct output, if the second stage is the last one, as in our
5922    example), we first put the output of extract_even operation and then the
5923    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5924    The input for the second stage is:
5925
5926    1st vec (E1):  0  2  4  6  8 10 12 14
5927    2nd vec (E3): 16 18 20 22 24 26 28 30
5928    3rd vec (E2):  1  3  5  7  9 11 13 15
5929    4th vec (E4): 17 19 21 23 25 27 29 31
5930
5931    The output of the second stage:
5932
5933    E1: 0 4  8 12 16 20 24 28
5934    E2: 2 6 10 14 18 22 26 30
5935    E3: 1 5  9 13 17 21 25 29
5936    E4: 3 7 11 15 19 23 27 31
5937
5938    And RESULT_CHAIN after reordering:
5939
5940    1st vec (E1):  0 4  8 12 16 20 24 28
5941    2nd vec (E3):  1 5  9 13 17 21 25 29
5942    3rd vec (E2):  2 6 10 14 18 22 26 30
5943    4th vec (E4):  3 7 11 15 19 23 27 31.

        VINFO, STMT_INFO and GSI are only passed on to
        vect_finish_stmt_generation for each permute statement that is
        created.  */
5944
5945 static void
5946 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
5947                          unsigned int length,
5948                          stmt_vec_info stmt_info,
5949                          gimple_stmt_iterator *gsi,
5950                          vec<tree> *result_chain)
5951 {
5952   tree data_ref, first_vect, second_vect;
5953   tree perm_mask_even, perm_mask_odd;
5954   tree perm3_mask_low, perm3_mask_high;
5955   gimple *perm_stmt;
5956   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
       /* LOG_LENGTH is only read by the power-of-two branch below.  */
5957   unsigned int i, j, log_length = exact_log2 (length);
5958
       /* Start RESULT_CHAIN as a LENGTH-element copy of DR_CHAIN; every slot
          is overwritten by the permuted vectors below.  */
5959   result_chain->quick_grow (length);
5960   memcpy (result_chain->address (), dr_chain.address (),
5961           length * sizeof (tree));
5962
5963   if (length == 3)
5964     {
5965       /* vect_grouped_load_supported ensures that this is constant.  */
5966       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5967       unsigned int k;
5968
5969       vec_perm_builder sel (nelt, nelt, 1);
5970       sel.quick_grow (nelt);
5971       vec_perm_indices indices;
           /* One iteration per output vector; each needs two permutes because
              a VEC_PERM_EXPR combines only two input vectors.  */
5972       for (k = 0; k < 3; k++)
5973         {
               /* Low mask: every third element, starting at K, of the
                  concatenation of the first two vectors; lanes that would
                  need the third vector get index 0 as a placeholder.  */
5974           for (i = 0; i < nelt; i++)
5975             if (3 * i + k < 2 * nelt)
5976               sel[i] = 3 * i + k;
5977             else
5978               sel[i] = 0;
5979           indices.new_vector (sel, 2, nelt);
5980           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5981
               /* High mask: keep the lanes filled by the low permute and
                  take the remaining ones from the second operand, every
                  third element starting at (nelt + k) % 3.  */
5982           for (i = 0, j = 0; i < nelt; i++)
5983             if (3 * i + k < 2 * nelt)
5984               sel[i] = i;
5985             else
5986               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5987           indices.new_vector (sel, 2, nelt);
5988           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5989
5990           first_vect = dr_chain[0];
5991           second_vect = dr_chain[1];
5992
5993           /* Create interleaving stmt (low part of):
5994              low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5995                                                             ...}>  */
5996           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5997           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5998                                            second_vect, perm3_mask_low);
5999           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6000
6001           /* Create interleaving stmt (high part of):
6002              high = VEC_PERM_EXPR <low, third_vect, perm3_mask_high>, which
6003              completes the lanes that the low permute left as placeholders.  */
6004           first_vect = data_ref;
6005           second_vect = dr_chain[2];
6006           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6007           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6008                                            second_vect, perm3_mask_high);
6009           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6010           (*result_chain)[k] = data_ref;
6011         }
6012     }
6013   else
6014     {
6015       /* If length is not equal to 3 then only power of 2 is supported.  */
6016       gcc_assert (pow2p_hwi (length));
6017
6018       /* The encoding has a single stepped pattern.  */
6019       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6020       vec_perm_builder sel (nelt, 1, 3);
6021       sel.quick_grow (3);
           /* Extract-even selector: {0, 2, 4, ...}.  */
6022       for (i = 0; i < 3; ++i)
6023         sel[i] = i * 2;
6024       vec_perm_indices indices (sel, 2, nelt);
6025       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6026
           /* Extract-odd selector: {1, 3, 5, ...}.  */
6027       for (i = 0; i < 3; ++i)
6028         sel[i] = i * 2 + 1;
6029       indices.new_vector (sel, 2, nelt);
6030       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6031
           /* log2 (LENGTH) stages; each stage combines adjacent pairs of
              DR_CHAIN and stores the even results in the first half of
              RESULT_CHAIN and the odd results in the second half.  */
6032       for (i = 0; i < log_length; i++)
6033         {
6034           for (j = 0; j < length; j += 2)
6035             {
6036               first_vect = dr_chain[j];
6037               second_vect = dr_chain[j+1];
6038
6039               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
6040               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6041               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6042                                                first_vect, second_vect,
6043                                                perm_mask_even);
6044               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6045               (*result_chain)[j/2] = data_ref;
6046
6047               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
6048               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6049               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6050                                                first_vect, second_vect,
6051                                                perm_mask_odd);
6052               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6053               (*result_chain)[j/2+length/2] = data_ref;
6054             }
               /* The output of this stage is the input of the next one.
                  NOTE(review): DR_CHAIN is a by-value parameter but this
                  writes through its address (); presumably the caller's
                  vector shares that storage and is overwritten too -
                  confirm.  */
6055           memcpy (dr_chain.address (), result_chain->address (),
6056                   length * sizeof (tree));
6057         }
6058     }
6059 }
6060
6061 /* Function vect_shift_permute_load_chain.
6062
6063    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6064    sequence of stmts to reorder the input data accordingly.
6065    Return the final references for loads in RESULT_CHAIN.
6066    Return true if successful, false otherwise.
6067
6068    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6069    The input is 3 vectors each containing 8 elements.  We assign a
6070    number to each element, the input sequence is:
6071
6072    1st vec:   0  1  2  3  4  5  6  7
6073    2nd vec:   8  9 10 11 12 13 14 15
6074    3rd vec:  16 17 18 19 20 21 22 23
6075
6076    The output sequence should be:
6077
6078    1st vec:  0 3 6  9 12 15 18 21
6079    2nd vec:  1 4 7 10 13 16 19 22
6080    3rd vec:  2 5 8 11 14 17 20 23
6081
6082    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6083
6084    First we shuffle all 3 vectors to get correct elements order:
6085
6086    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6087    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6088    3rd vec:  (16 19 22) (17 20 23) (18 21)
6089
6090    Next we unite and shift vector 3 times:
6091
6092    1st step:
6093      shift right by 6 the concatenation of:
6094      "1st vec" and  "2nd vec"
6095        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6096      "2nd vec" and  "3rd vec"
6097        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6098      "3rd vec" and  "1st vec"
6099        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6100                              | New vectors                   |
6101
6102      So that now new vectors are:
6103
6104      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6105      2nd vec:  (10 13) (16 19 22) (17 20 23)
6106      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6107
6108    2nd step:
6109      shift right by 5 the concatenation of:
6110      "1st vec" and  "3rd vec"
6111        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6112      "2nd vec" and  "1st vec"
6113        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6114      "3rd vec" and  "2nd vec"
6115        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6116                           | New vectors                   |
6117
6118      So that now new vectors are:
6119
6120      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6121      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6122      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6123
6124    3rd step:
6125      shift right by 5 the concatenation of:
6126      "1st vec" and  "1st vec"
6127        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6128      shift right by 3 the concatenation of:
6129      "2nd vec" and  "2nd vec"
6130                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6131                           | New vectors                   |
6132
6133      So that now all vectors are READY:
6134      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6135      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6136      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6137
6138    This algorithm is faster than one in vect_permute_load_chain if:
6139      1.  "shift of a concatenation" is faster than general permutation.
6140          This is usually so.
6141      2.  The TARGET machine can't execute vector instructions in parallel.
6142          This is because each step of the algorithm depends on previous.
6143          The algorithm in vect_permute_load_chain is much more parallel.
6144
6145    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6146 */
6147
6148 static bool
6149 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6150                                unsigned int length,
6151                                stmt_vec_info stmt_info,
6152                                gimple_stmt_iterator *gsi,
6153                                vec<tree> *result_chain)
6154 {
6155   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6156   tree perm2_mask1, perm2_mask2, perm3_mask;
6157   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6158   gimple *perm_stmt;
6159
6160   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6161   unsigned int i;
6162   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6163   unsigned HOST_WIDE_INT nelt, vf;
6164
6165   /* Needs a loop VF, and NELT and VF must be compile-time constants.  */
6166   if (!loop_vinfo || !TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6167       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6168     return false;
6169
6170   vec_perm_builder sel (nelt, nelt, 1);
6171   sel.quick_grow (nelt);
6172
6173   result_chain->quick_grow (length);
6174   memcpy (result_chain->address (), dr_chain.address (),
6175           length * sizeof (tree));
6176   /* Power-of-two LENGTH: log2 (LENGTH) rounds splitting even/odd elements.  */
6177   if (pow2p_hwi (length) && vf > 4)
6178     {
6179       unsigned int j, log_length = exact_log2 (length);
6180       for (i = 0; i < nelt / 2; ++i)
6181         sel[i] = i * 2;
6182       for (i = 0; i < nelt / 2; ++i)
6183         sel[nelt / 2 + i] = i * 2 + 1;
6184       vec_perm_indices indices (sel, 2, nelt);  /* {0 2 4 6 1 3 5 7} for 8.  */
6185       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6186         {
6187           if (dump_enabled_p ())
6188             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6189                              "shuffle of 2 fields structure is not "
6190                              "supported by target\n");
6191           return false;
6192         }
6193       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6194
6195       for (i = 0; i < nelt / 2; ++i)
6196         sel[i] = i * 2 + 1;
6197       for (i = 0; i < nelt / 2; ++i)
6198         sel[nelt / 2 + i] = i * 2;
6199       indices.new_vector (sel, 2, nelt);  /* {1 3 5 7 0 2 4 6} for 8.  */
6200       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6201         {
6202           if (dump_enabled_p ())
6203             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6204                              "shuffle of 2 fields structure is not "
6205                              "supported by target\n");
6206           return false;
6207         }
6208       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6209
6210       /* Generating permutation constant to shift all elements.
6211          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6212       for (i = 0; i < nelt; i++)
6213         sel[i] = nelt / 2 + i;
6214       indices.new_vector (sel, 2, nelt);
6215       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6216         {
6217           if (dump_enabled_p ())
6218             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6219                              "shift permutation is not supported by target\n");
6220           return false;
6221         }
6222       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6223
6224       /* Generating permutation constant to select vector from 2.
6225          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6226       for (i = 0; i < nelt / 2; i++)
6227         sel[i] = i;
6228       for (i = nelt / 2; i < nelt; i++)
6229         sel[i] = nelt + i;
6230       indices.new_vector (sel, 2, nelt);
6231       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6232         {
6233           if (dump_enabled_p ())
6234             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6235                              "select is not supported by target\n");
6236           return false;
6237         }
6238       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6239
6240       for (i = 0; i < log_length; i++)
6241         {
6242           for (j = 0; j < length; j += 2)
6243             {
6244               first_vect = dr_chain[j];
6245               second_vect = dr_chain[j + 1];
6246
6247               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6248               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6249                                                first_vect, first_vect,
6250                                                perm2_mask1);
6251               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6252               vect[0] = data_ref;
6253
6254               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6255               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6256                                                second_vect, second_vect,
6257                                                perm2_mask2);
6258               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6259               vect[1] = data_ref;
6260
6261               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6262               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6263                                                vect[0], vect[1], shift1_mask);
6264               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6265               (*result_chain)[j/2 + length/2] = data_ref;  /* Odd elements.  */
6266
6267               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6268               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6269                                                vect[0], vect[1], select_mask);
6270               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6271               (*result_chain)[j/2] = data_ref;  /* Even elements.  */
6272             }
6273           memcpy (dr_chain.address (), result_chain->address (),
6274                   length * sizeof (tree));  /* Input of the next round.  */
6275         }
6276       return true;
6277     }
6278   if (length == 3 && vf > 2)
6279     {
6280       unsigned int k = 0, l = 0;
6281
6282       /* Generating permutation constant to get all elements in right order.
6283          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6284       for (i = 0; i < nelt; i++)
6285         {
6286           if (3 * k + (l % 3) >= nelt)
6287             {
6288               k = 0;
6289               l += (3 - (nelt % 3));
6290             }
6291           sel[i] = 3 * k + (l % 3);
6292           k++;
6293         }
6294       vec_perm_indices indices (sel, 2, nelt);
6295       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6296         {
6297           if (dump_enabled_p ())
6298             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6299                              "shuffle of 3 fields structure is not "
6300                              "supported by target\n");
6301           return false;
6302         }
6303       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6304
6305       /* Generating permutation constant to shift all elements.
6306          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6307       for (i = 0; i < nelt; i++)
6308         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6309       indices.new_vector (sel, 2, nelt);
6310       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6311         {
6312           if (dump_enabled_p ())
6313             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6314                              "shift permutation is not supported by target\n");
6315           return false;
6316         }
6317       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6318
6319       /* Generating permutation constant to shift all elements.
6320          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6321       for (i = 0; i < nelt; i++)
6322         sel[i] = 2 * (nelt / 3) + 1 + i;
6323       indices.new_vector (sel, 2, nelt);
6324       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6325         {
6326           if (dump_enabled_p ())
6327             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6328                              "shift permutation is not supported by target\n");
6329           return false;
6330         }
6331       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6332
6333       /* Generating permutation constant to shift all elements.
6334          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6335       for (i = 0; i < nelt; i++)
6336         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6337       indices.new_vector (sel, 2, nelt);
6338       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6339         {
6340           if (dump_enabled_p ())
6341             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6342                              "shift permutation is not supported by target\n");
6343           return false;
6344         }
6345       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6346
6347       /* Generating permutation constant to shift all elements.
6348          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6349       for (i = 0; i < nelt; i++)
6350         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6351       indices.new_vector (sel, 2, nelt);
6352       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6353         {
6354           if (dump_enabled_p ())
6355             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6356                              "shift permutation is not supported by target\n");
6357           return false;
6358         }
6359       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6360
6361       for (k = 0; k < 3; k++)
6362         {
6363           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6364           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6365                                            dr_chain[k], dr_chain[k],
6366                                            perm3_mask);
6367           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6368           vect[k] = data_ref;
6369         }
6370
6371       for (k = 0; k < 3; k++)
6372         {
6373           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6374           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6375                                            vect[k % 3], vect[(k + 1) % 3],
6376                                            shift1_mask);
6377           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6378           vect_shift[k] = data_ref;
6379         }
6380
6381       for (k = 0; k < 3; k++)
6382         {
6383           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6384           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6385                                            vect_shift[(4 - k) % 3],
6386                                            vect_shift[(3 - k) % 3],
6387                                            shift2_mask);
6388           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6389           vect[k] = data_ref;
6390         }
6391
6392       (*result_chain)[3 - (nelt % 3)] = vect[2];  /* Needs no further shift.  */
6393
6394       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6395       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6396                                        vect[0], shift3_mask);
6397       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6398       (*result_chain)[nelt % 3] = data_ref;
6399
6400       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6401       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6402                                        vect[1], shift4_mask);
6403       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6404       (*result_chain)[0] = data_ref;
6405       return true;
6406     }
6407   return false;
6408 }
6409
6410 /* Function vect_transform_grouped_load.
6411
6412    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6413    to perform their permutation and ascribe the result vectorized statements to
6414    the scalar statements.
6415 */
6416
6417 void
6418 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6419                              vec<tree> dr_chain,
6420                              int size, gimple_stmt_iterator *gsi)
6421 {
6422   /* PERMUTED collects the reordered vectors computed from the interleaved
6423      input data-refs in DR_CHAIN; room for SIZE entries is reserved.  */
6424   vec<tree> permuted = vNULL;
6425   permuted.create (size);
6426
6427   /* Only when the target reports a reassociation width of at most 1 for
6428      vector permutes, and SIZE is not a power of two, is the shift-based
6429      scheme attempted; it returns false when it cannot be used.  */
6430   machine_mode vmode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6431   bool shifted = false;
6432   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, vmode) <= 1
6433       && !pow2p_hwi (size))
6434     shifted = vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6435                                              gsi, &permuted);
6436
6437   /* Otherwise fall back to the general permutation scheme.  */
6438   if (!shifted)
6439     vect_permute_load_chain (vinfo, dr_chain, size, stmt_info, gsi, &permuted);
6440   vect_record_grouped_load_vectors (vinfo, stmt_info, permuted);
6441   permuted.release ();
6442 }
6443
6444 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6445    generated as part of the vectorization of STMT_INFO.  Assign the statement
6446    for each vector to the associated scalar statement.  */
6447
6448 void
6449 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6450                                   vec<tree> result_chain)
6451 {
6452   stmt_vec_info group_head = DR_GROUP_FIRST_ELEMENT (stmt_info);
6453   stmt_vec_info member = group_head;
6454   unsigned int idx;
6455   unsigned int steps_seen = 1;
6456   tree vec_def;
6457
6458   /* Walk RESULT_CHAIN and the group's statement list in parallel, starting
6459      from the first element of the group, so that each vector is handed
6460      to the scalar statement at the matching position.  */
6461   FOR_EACH_VEC_ELT (result_chain, idx, vec_def)
6462     {
6463       /* Every statement of the group has been served; any remaining
6464          vectors are left unrecorded.  */
6465       if (!member)
6466         break;
6467
6468       /* DR_GROUP_GAP is the number of steps in elements from the previous
6469          access (1 when there is no gap).  Vectors that correspond to a gap
6470          are not recorded for any statement; the loads created for them are
6471          expected to be removed later by dead code elimination.  The first
6472          statement of the group is never treated as being behind a gap.  */
6473       bool in_gap = (member != group_head
6474                      && steps_seen < DR_GROUP_GAP (member));
6475       if (in_gap)
6476         {
6477           steps_seen++;
6478           continue;
6479         }
6480
6481       /* Append the statement defining this vector to MEMBER's list of
6482          vectorized statements; with multiple copies the newest goes last.
6483          MEMBER is known to be non-null here.  */
6484       gimple *def_stmt = SSA_NAME_DEF_STMT (vec_def);
6485       STMT_VINFO_VEC_STMTS (member).safe_push (def_stmt);
6486
6487       /* Advance to the next statement of the group and restart the gap
6488          count for it.  */
6489       member = DR_GROUP_NEXT_ELEMENT (member);
6490       steps_seen = 1;
6491     }
6492 }
6493
6494 /* Function vect_can_force_dr_alignment_p.
6495
6496    Returns whether the alignment of a DECL can be forced to be aligned
6497    on ALIGNMENT bit boundary.  */
6498
6499 bool
6500 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6501 {
6502   /* Reject non-variables and symtab variables that refuse more alignment.  */
6503   if (!VAR_P (decl)
6504       || (decl_in_symtab_p (decl)
6505           && !symtab_node::get (decl)->can_increase_alignment_p ()))
6506     return false;
6507
6508   /* Bound: MAX_OFILE_ALIGNMENT if TREE_STATIC, else MAX_STACK_ALIGNMENT.  */
6509   unsigned HOST_WIDE_INT limit
6510     = (TREE_STATIC (decl)
6511        ? (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT
6512        : (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT);
6513   return known_le (alignment, limit);
6514 }
6515
6516
6517 /* Return whether the data reference DR_INFO is supported with respect to its
6518    alignment.
6519    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6520    if it is aligned, i.e., check if it is possible to vectorize it with
6521    different alignment.  */
6522
6523 enum dr_alignment_support
6524 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6525                                bool check_aligned_accesses)
6526 {
6527   data_reference *dr = dr_info->dr;
6528   stmt_vec_info stmt_info = dr_info->stmt;
6529   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6530   machine_mode mode = TYPE_MODE (vectype);
6531   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6532
6533   /* Aligned accesses are only analyzed further on request.  */
6534   if (aligned_access_p (dr_info) && !check_aligned_accesses)
6535     return dr_aligned;
6536
6537   /* For now assume all conditional loads/stores support unaligned
6538      access without any special code.  */
6539   gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
6540   if (call
6541       && gimple_call_internal_p (call)
6542       && (gimple_call_internal_fn (call) == IFN_MASK_LOAD
6543           || gimple_call_internal_fn (call) == IFN_MASK_STORE))
6544     return dr_unaligned_supported;
6545
6546   /* NESTED_IN_VECT_LOOP stays false when this is not loop vectorization.  */
6547   bool nested_in_vect_loop = false;
6548   if (loop_vinfo)
6549     nested_in_vect_loop
6550       = nested_in_vect_loop_p (LOOP_VINFO_LOOP (loop_vinfo), stmt_info);
6551
6552   /* Possibly unaligned access.  */
6553
6554   /* We can choose between using the implicit realignment scheme (generating
6555      a misaligned_move stmt) and the explicit realignment scheme (generating
6556      aligned loads with a REALIGN_LOAD).  There are two variants to the
6557      explicit realignment scheme: optimized, and unoptimized.
6558      We can optimize the realignment only if the step between consecutive
6559      vector loads is equal to the vector size.  Since the vector memory
6560      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6561      is guaranteed that the misalignment amount remains the same throughout the
6562      execution of the vectorized loop.  Therefore, we can create the
6563      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6564      at the loop preheader.
6565
6566      However, in the case of outer-loop vectorization, when vectorizing a
6567      memory access in the inner-loop nested within the LOOP that is now being
6568      vectorized, while it is guaranteed that the misalignment of the
6569      vectorized memory access will remain the same in different outer-loop
6570      iterations, it is *not* guaranteed that it will remain the same throughout
6571      the execution of the inner-loop.  This is because the inner-loop advances
6572      with the original scalar step (and not in steps of VS).  If the inner-loop
6573      step happens to be a multiple of VS, then the misalignment remains fixed
6574      and we can use the optimized realignment scheme.  For example:
6575
6576       for (i=0; i<N; i++)
6577         for (j=0; j<M; j++)
6578           s += a[i+j];
6579
6580      When vectorizing the i-loop in the above example, the step between
6581      consecutive vector loads is 1, and so the misalignment does not remain
6582      fixed across the execution of the inner-loop, and the realignment cannot
6583      be optimized (as illustrated in the following pseudo vectorized loop):
6584
6585       for (i=0; i<N; i+=4)
6586         for (j=0; j<M; j++){
6587           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6588                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6589                          // (assuming that we start from an aligned address).
6590           }
6591
6592      We therefore have to use the unoptimized realignment scheme:
6593
6594       for (i=0; i<N; i+=4)
6595           for (j=k; j<M; j+=4)
6596           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6597                            // that the misalignment of the initial address is
6598                            // 0).
6599
6600      The loop can then be vectorized as follows:
6601
6602       for (k=0; k<4; k++){
6603         rt = get_realignment_token (&vp[k]);
6604         for (i=0; i<N; i+=4){
6605           v1 = vp[i+k];
6606           for (j=k; j<M; j+=4){
6607             v2 = vp[i+j+VS-1];
6608             va = REALIGN_LOAD <v1,v2,rt>;
6609             vs += va;
6610             v1 = v2;
6611           }
6612         }
6613     } */
6614
6615   tree type = TREE_TYPE (DR_REF (dr));
6616
6617   /* Loads may use the explicit realignment scheme when the target
6618      provides vec_realign_load for MODE and either defines no
6619      builtin_mask_for_load hook or the hook returns a non-null value.  */
6620   if (DR_IS_READ (dr)
6621       && optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6622       && (!targetm.vectorize.builtin_mask_for_load
6623           || targetm.vectorize.builtin_mask_for_load ()))
6624     {
6625       /* If we are doing SLP then the accesses need not have the
6626          same alignment, instead it depends on the SLP group size.  */
6627       bool slp_group_mismatch
6628         = (loop_vinfo
6629            && STMT_SLP_TYPE (stmt_info)
6630            && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6631                            * (DR_GROUP_SIZE
6632                               (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6633                            TYPE_VECTOR_SUBPARTS (vectype)));
6634
6635       if (!slp_group_mismatch)
6636         {
6637           /* The optimized variant requires loop vectorization and, for an
6638              access nested in the vectorized loop, a step known to equal
6639              the size of the vector mode.  */
6640           if (!loop_vinfo
6641               || (nested_in_vect_loop
6642                   && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6643                                GET_MODE_SIZE (mode))))
6644             return dr_explicit_realign;
6645
6646           return dr_explicit_realign_optimized;
6647         }
6648     }
6649
6650   /* Neither realignment scheme applies (or this is a store): ask the
6651      target whether it supports the misaligned access directly.
6652      IS_PACKED is only computed, via not_size_aligned, when the alignment
6653      of the access is not known; otherwise it stays false.  For loads a
6654      positive answer means they can't be software pipelined, but can at
6655      least be done.  */
6656   bool is_packed = false;
6657   if (!known_alignment_for_access_p (dr_info))
6658     is_packed = not_size_aligned (DR_REF (dr));
6659
6660   if (targetm.vectorize.support_vector_misalignment
6661         (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6662     return dr_unaligned_supported;
6663
6664   /* Unsupported.  */
6665   return dr_unaligned_unsupported;
6666 }