gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
57 /* Return true if load- or store-lanes optab OPTAB is implemented for
58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
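/* Illustrative, target-dependent example (not taken from this file): on
   AArch64 a group of COUNT == 2 accesses of V4SI vectors can be handled by
   the LD2/ST2 instructions; in that case the corresponding load/store-lanes
   optab is implemented and this function returns true.  */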
60 static bool
61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
62 tree vectype, unsigned HOST_WIDE_INT count)
64 machine_mode mode, array_mode;
65 bool limit_p;
67 mode = TYPE_MODE (vectype);
68 if (!targetm.array_mode (mode, count).exists (&array_mode))
70 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
71 limit_p = !targetm.array_mode_supported_p (mode, count);
72 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 if (dump_enabled_p ())
75 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
76 "no array mode for %s[%wu]\n",
77 GET_MODE_NAME (mode), count);
78 return false;
82 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
84 if (dump_enabled_p ())
85 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
86 "cannot use %s<%s><%s>\n", name,
87 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
88 return false;
91 if (dump_enabled_p ())
92 dump_printf_loc (MSG_NOTE, vect_location,
93 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
94 GET_MODE_NAME (mode));
96 return true;
100 /* Return the smallest scalar type used by STMT_INFO.
101 This is used to determine the vectype of the stmt. We generally set the
102 vectype according to the type of the result (lhs). For stmts whose
103 result-type is different than the type of the arguments (e.g., demotion,
104 promotion), vectype will be reset appropriately (later). Note that we have
105 to visit the smallest datatype in this function, because that determines the
106 VF. If the smallest datatype in the loop is present only as the rhs of a
107 promotion operation - we'd miss it.
108 Such a case, where a variable of this datatype does not appear in the lhs
109 anywhere in the loop, can only occur if it's an invariant: e.g.:
110 'int_x = (int) short_inv', which we'd expect to have been optimized away by
111 invariant motion. However, we cannot rely on invariant motion to always
112 take invariants out of the loop, and so in the case of promotion we also
113 have to check the rhs.
114 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
115 types. */
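/* For example (hypothetical numbers): on a target with 16-byte vectors, a
   loop that mixes 2-byte and 4-byte scalars must derive the vectorization
   factor from the 2-byte type (VF = 8 here); the 4-byte operations are then
   split across two vector statements.  */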
117 tree
118 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
119 HOST_WIDE_INT *lhs_size_unit,
120 HOST_WIDE_INT *rhs_size_unit)
122 tree scalar_type = gimple_expr_type (stmt_info->stmt);
123 HOST_WIDE_INT lhs, rhs;
125 /* During the analysis phase, this function is called on arbitrary
126 statements that might not have scalar results. */
127 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
128 return scalar_type;
130 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
132 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
133 if (assign
134 && (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
147 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
149 unsigned int i = 0;
150 if (gimple_call_internal_p (call))
152 internal_fn ifn = gimple_call_internal_fn (call);
153 if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
154 /* gimple_expr_type already picked the type of the loaded
155 or stored data. */
156 i = ~0U;
157 else if (internal_fn_mask_index (ifn) == 0)
158 i = 1;
160 if (i < gimple_call_num_args (call))
162 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
163 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
165 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
166 if (rhs < lhs)
167 scalar_type = rhs_type;
172 *lhs_size_unit = lhs;
173 *rhs_size_unit = rhs;
174 return scalar_type;
178 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
179 tested at run-time. Return TRUE if DDR was successfully inserted.
180 Return false if versioning is not supported. */
182 static opt_result
183 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
185 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
187 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
188 return opt_result::failure_at (vect_location,
189 "will not create alias checks, as"
190 " --param vect-max-version-for-alias-checks"
191 " == 0\n");
193 opt_result res
194 = runtime_alias_check_p (ddr, loop,
195 optimize_loop_nest_for_speed_p (loop));
196 if (!res)
197 return res;
199 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
200 return opt_result::success ();
203 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
205 static void
206 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
208 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
209 for (unsigned int i = 0; i < checks.length(); ++i)
210 if (checks[i] == value)
211 return;
213 if (dump_enabled_p ())
214 dump_printf_loc (MSG_NOTE, vect_location,
215 "need run-time check that %T is nonzero\n",
216 value);
217 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
220 /* Return true if we know that the order of vectorized DR_INFO_A and
221 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
222 DR_INFO_B. At least one of the accesses is a write. */
224 static bool
225 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
227 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
228 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
230 /* Single statements are always kept in their original order. */
231 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
232 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
233 return true;
235 /* STMT_A and STMT_B belong to overlapping groups. All loads are
236 emitted at the position of the first scalar load.
237 Stores in a group are emitted at the position of the last scalar store.
238 Compute that position and check whether the resulting order matches
239 the current one. */
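/* Illustrative example (hypothetical scalar code):
     x0 = a[i];        <-- load group: vector load emitted here
     b[i] = y0;
     x1 = a[i+1];
     b[i+1] = y1;      <-- store group: vector store emitted here
   The vector load still precedes the vector store, matching the scalar
   order of each load/store pair, so the order is preserved.  If the
   computed positions swap relative to the scalar order we return false.  */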
240 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
241 if (il_a)
243 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
244 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
245 s = DR_GROUP_NEXT_ELEMENT (s))
246 il_a = get_later_stmt (il_a, s);
247 else /* DR_IS_READ */
248 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
249 s = DR_GROUP_NEXT_ELEMENT (s))
250 if (get_later_stmt (il_a, s) == il_a)
251 il_a = s;
253 else
254 il_a = stmtinfo_a;
255 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
256 if (il_b)
258 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
259 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
260 s = DR_GROUP_NEXT_ELEMENT (s))
261 il_b = get_later_stmt (il_b, s);
262 else /* DR_IS_READ */
263 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
264 s = DR_GROUP_NEXT_ELEMENT (s))
265 if (get_later_stmt (il_b, s) == il_b)
266 il_b = s;
268 else
269 il_b = stmtinfo_b;
270 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
271 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
274 /* A subroutine of vect_analyze_data_ref_dependence. Handle
275 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
276 distances. These distances are conservatively correct but they don't
277 reflect a guaranteed dependence.
279 Return true if this function does all the work necessary to avoid
280 an alias or false if the caller should use the dependence distances
281 to limit the vectorization factor in the usual way. LOOP_DEPTH is
282 the depth of the loop described by LOOP_VINFO and the other arguments
283 are as for vect_analyze_data_ref_dependence. */
285 static bool
286 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
287 loop_vec_info loop_vinfo,
288 int loop_depth, unsigned int *max_vf)
290 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
291 lambda_vector dist_v;
292 unsigned int i;
293 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
295 int dist = dist_v[loop_depth];
296 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
298 /* If the user asserted safelen >= DIST consecutive iterations
299 can be executed concurrently, assume independence.
301 ??? An alternative would be to add the alias check even
302 in this case, and vectorize the fallback loop with the
303 maximum VF set to safelen. However, if the user has
304 explicitly given a length, it's less likely that that
305 would be a win. */
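/* Illustrative example (not from this file): with
     #pragma omp simd safelen(8)
     for (i = 0; i < n; i++)
       a[i] = a[i + d] + 1;
   the user asserts that up to 8 consecutive iterations may execute
   concurrently, so a dependence distance of at most 8 can be ignored here
   as long as the vectorization factor is capped at 8.  */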
306 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
308 if ((unsigned int) loop->safelen < *max_vf)
309 *max_vf = loop->safelen;
310 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
311 continue;
314 /* For dependence distances of 2 or more, we have the option
315 of limiting VF or checking for an alias at runtime.
316 Prefer to check at runtime if we can, to avoid limiting
317 the VF unnecessarily when the bases are in fact independent.
319 Note that the alias checks will be removed if the VF ends up
320 being small enough. */
321 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
322 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
323 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
324 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
325 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
328 return true;
332 /* Function vect_analyze_data_ref_dependence.
334 Return a failure opt_result if there (might) exist a dependence between
336 memory-reference DRA and memory-reference DRB that prevents vectorization.
337 When the dependence can instead be checked at run-time by versioning for
338 alias, record the check and return success. Adjust *MAX_VF according to
339 the data dependence. */
341 static opt_result
342 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
343 loop_vec_info loop_vinfo,
344 unsigned int *max_vf)
346 unsigned int i;
347 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
348 struct data_reference *dra = DDR_A (ddr);
349 struct data_reference *drb = DDR_B (ddr);
350 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
351 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
352 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
353 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
354 lambda_vector dist_v;
355 unsigned int loop_depth;
357 /* In loop analysis all data references should be vectorizable. */
358 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
359 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
360 gcc_unreachable ();
362 /* Independent data accesses. */
363 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
364 return opt_result::success ();
366 if (dra == drb
367 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
368 return opt_result::success ();
370 /* We do not have to consider dependences between accesses that belong
371 to the same group, unless the stride could be smaller than the
372 group size. */
373 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
374 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
375 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
376 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
377 return opt_result::success ();
379 /* Even if we have an anti-dependence then, as the vectorized loop covers at
380 least two scalar iterations, there is always also a true dependence.
381 As the vectorizer does not re-order loads and stores we can ignore
382 the anti-dependence if TBAA can disambiguate both DRs similar to the
383 case with known negative distance anti-dependences (positive
384 distance anti-dependences would violate TBAA constraints). */
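/* Illustrative example (hypothetical): for
     ... = *q;          // read through "int *q"
     *p = ...;          // write through "float *p"
   TBAA guarantees the int and float accesses cannot alias, so the apparent
   anti-dependence between them can be ignored.  */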
385 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
386 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
387 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
388 get_alias_set (DR_REF (drb))))
389 return opt_result::success ();
391 /* Unknown data dependence. */
392 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
394 /* If user asserted safelen consecutive iterations can be
395 executed concurrently, assume independence. */
396 if (loop->safelen >= 2)
398 if ((unsigned int) loop->safelen < *max_vf)
399 *max_vf = loop->safelen;
400 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
401 return opt_result::success ();
404 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
405 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
406 return opt_result::failure_at
407 (stmtinfo_a->stmt,
408 "versioning for alias not supported for: "
409 "can't determine dependence between %T and %T\n",
410 DR_REF (dra), DR_REF (drb));
412 if (dump_enabled_p ())
413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
414 "versioning for alias required: "
415 "can't determine dependence between %T and %T\n",
416 DR_REF (dra), DR_REF (drb));
418 /* Add to list of ddrs that need to be tested at run-time. */
419 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
422 /* Known data dependence. */
423 if (DDR_NUM_DIST_VECTS (ddr) == 0)
425 /* If user asserted safelen consecutive iterations can be
426 executed concurrently, assume independence. */
427 if (loop->safelen >= 2)
429 if ((unsigned int) loop->safelen < *max_vf)
430 *max_vf = loop->safelen;
431 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
432 return opt_result::success ();
435 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
436 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
437 return opt_result::failure_at
438 (stmtinfo_a->stmt,
439 "versioning for alias not supported for: "
440 "bad dist vector for %T and %T\n",
441 DR_REF (dra), DR_REF (drb));
443 if (dump_enabled_p ())
444 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
445 "versioning for alias required: "
446 "bad dist vector for %T and %T\n",
447 DR_REF (dra), DR_REF (drb));
448 /* Add to list of ddrs that need to be tested at run-time. */
449 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
452 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
454 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
455 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
456 loop_depth, max_vf))
457 return opt_result::success ();
459 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
461 int dist = dist_v[loop_depth];
463 if (dump_enabled_p ())
464 dump_printf_loc (MSG_NOTE, vect_location,
465 "dependence distance = %d.\n", dist);
467 if (dist == 0)
469 if (dump_enabled_p ())
470 dump_printf_loc (MSG_NOTE, vect_location,
471 "dependence distance == 0 between %T and %T\n",
472 DR_REF (dra), DR_REF (drb));
474 /* When we perform grouped accesses and perform implicit CSE
475 by detecting equal accesses and doing disambiguation with
476 runtime alias tests like for
477 .. = a[i];
478 .. = a[i+1];
479 a[i] = ..;
480 a[i+1] = ..;
481 *p = ..;
482 .. = a[i];
483 .. = a[i+1];
484 where we will end up loading { a[i], a[i+1] } once, make
485 sure that inserting group loads before the first load and
486 stores after the last store will do the right thing.
487 Similar for groups like
488 a[i] = ...;
489 ... = a[i];
490 a[i+1] = ...;
491 where loads from the group interleave with the store. */
492 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
493 return opt_result::failure_at (stmtinfo_a->stmt,
494 "READ_WRITE dependence"
495 " in interleaving.\n");
497 if (loop->safelen < 2)
499 tree indicator = dr_zero_step_indicator (dra);
500 if (!indicator || integer_zerop (indicator))
501 return opt_result::failure_at (stmtinfo_a->stmt,
502 "access also has a zero step\n");
503 else if (TREE_CODE (indicator) != INTEGER_CST)
504 vect_check_nonzero_value (loop_vinfo, indicator);
506 continue;
509 if (dist > 0 && DDR_REVERSED_P (ddr))
511 /* If DDR_REVERSED_P the order of the data-refs in DDR was
512 reversed (to make distance vector positive), and the actual
513 distance is negative. */
514 if (dump_enabled_p ())
515 dump_printf_loc (MSG_NOTE, vect_location,
516 "dependence distance negative.\n");
517 /* When doing outer loop vectorization, we need to check if there is
518 a backward dependence at the inner loop level if the dependence
519 at the outer loop is reversed. See PR81740. */
520 if (nested_in_vect_loop_p (loop, stmtinfo_a)
521 || nested_in_vect_loop_p (loop, stmtinfo_b))
523 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
524 DDR_LOOP_NEST (ddr));
525 if (dist_v[inner_depth] < 0)
526 return opt_result::failure_at (stmtinfo_a->stmt,
527 "not vectorized, dependence "
528 "between data-refs %T and %T\n",
529 DR_REF (dra), DR_REF (drb));
531 /* Record a negative dependence distance to later limit the
532 amount of stmt copying / unrolling we can perform.
533 Only need to handle read-after-write dependence. */
534 if (DR_IS_READ (drb)
535 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
536 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
537 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
538 continue;
541 unsigned int abs_dist = abs (dist);
542 if (abs_dist >= 2 && abs_dist < *max_vf)
544 /* The dependence distance requires reduction of the maximal
545 vectorization factor. */
546 *max_vf = abs_dist;
547 if (dump_enabled_p ())
548 dump_printf_loc (MSG_NOTE, vect_location,
549 "adjusting maximal vectorization factor to %i\n",
550 *max_vf);
553 if (abs_dist >= *max_vf)
555 /* Dependence distance does not create dependence, as far as
556 vectorization is concerned, in this case. */
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE, vect_location,
559 "dependence distance >= VF.\n");
560 continue;
563 return opt_result::failure_at (stmtinfo_a->stmt,
564 "not vectorized, possible dependence "
565 "between data-refs %T and %T\n",
566 DR_REF (dra), DR_REF (drb));
569 return opt_result::success ();
572 /* Function vect_analyze_data_ref_dependences.
574 Examine all the data references in the loop, and make sure there do not
575 exist any data dependences between them. Set *MAX_VF according to
576 the maximum vectorization factor the data dependences allow. */
578 opt_result
579 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
580 unsigned int *max_vf)
582 unsigned int i;
583 struct data_dependence_relation *ddr;
585 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
587 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
589 LOOP_VINFO_DDRS (loop_vinfo)
590 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
591 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
592 /* We need read-read dependences to compute
593 STMT_VINFO_SAME_ALIGN_REFS. */
594 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
595 &LOOP_VINFO_DDRS (loop_vinfo),
596 LOOP_VINFO_LOOP_NEST (loop_vinfo),
597 true);
598 gcc_assert (res);
601 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
603 /* For epilogues we either have no aliases or alias versioning
604 was applied to original loop. Therefore we may just get max_vf
605 using VF of original loop. */
606 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
607 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
608 else
609 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
611 opt_result res
612 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
613 if (!res)
614 return res;
617 return opt_result::success ();
621 /* Function vect_slp_analyze_data_ref_dependence.
623 Return TRUE if there (might) exist a dependence between a memory-reference
624 DRA and a memory-reference DRB for VINFO that would prevent
625 vectorization. Return FALSE if the accesses are independent or the
626 dependence can be ignored. */
628 static bool
629 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
630 struct data_dependence_relation *ddr)
632 struct data_reference *dra = DDR_A (ddr);
633 struct data_reference *drb = DDR_B (ddr);
634 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
635 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
637 /* We need to check dependences of statements marked as unvectorizable
638 as well, they still can prohibit vectorization. */
640 /* Independent data accesses. */
641 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
642 return false;
644 if (dra == drb)
645 return false;
647 /* Read-read is OK. */
648 if (DR_IS_READ (dra) && DR_IS_READ (drb))
649 return false;
651 /* If dra and drb are part of the same interleaving chain consider
652 them independent. */
653 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
654 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
655 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
656 return false;
658 /* Unknown data dependence. */
659 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
661 if (dump_enabled_p ())
662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
663 "can't determine dependence between %T and %T\n",
664 DR_REF (dra), DR_REF (drb));
666 else if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location,
668 "determined dependence between %T and %T\n",
669 DR_REF (dra), DR_REF (drb));
671 return true;
675 /* Analyze dependences involved in the transform of SLP NODE. STORES
676 contain the vector of scalar stores of this instance if we are
677 disambiguating the loads. */
679 static bool
680 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
681 vec<stmt_vec_info> stores,
682 stmt_vec_info last_store_info)
684 /* This walks over all stmts involved in the SLP load/store done
685 in NODE verifying we can sink them up to the last stmt in the
686 group. */
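/* Illustrative example (hypothetical): for an SLP store group
     a[0] = x;
     *p = z;            // intervening store
     a[1] = y;
   the vector store of { x, y } is emitted at the position of "a[1] = y",
   so we must verify that "a[0] = x" can be sunk past "*p = z", i.e. that
   *p neither clobbers nor reads a[0].  */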
687 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
689 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
690 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
692 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
693 if (access_info == last_access_info)
694 continue;
695 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
696 ao_ref ref;
697 bool ref_initialized_p = false;
698 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
699 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
701 gimple *stmt = gsi_stmt (gsi);
702 if (! gimple_vuse (stmt))
703 continue;
705 /* If we couldn't record a (single) data reference for this
706 stmt we have to resort to the alias oracle. */
707 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
708 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
709 if (!dr_b)
711 /* We are moving a store - this means
712 we cannot use TBAA for disambiguation. */
713 if (!ref_initialized_p)
714 ao_ref_init (&ref, DR_REF (dr_a));
715 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
716 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
717 return false;
718 continue;
721 bool dependent = false;
722 /* If we run into a store of this same instance (we've just
723 marked those) then delay dependence checking until we run
724 into the last store because this is where it will have
725 been sunk to (and we verify if we can do that as well). */
726 if (gimple_visited_p (stmt))
728 if (stmt_info != last_store_info)
729 continue;
730 unsigned i;
731 stmt_vec_info store_info;
732 FOR_EACH_VEC_ELT (stores, i, store_info)
734 data_reference *store_dr
735 = STMT_VINFO_DATA_REF (store_info);
736 ddr_p ddr = initialize_data_dependence_relation
737 (dr_a, store_dr, vNULL);
738 dependent
739 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
740 free_dependence_relation (ddr);
741 if (dependent)
742 break;
745 else
747 ddr_p ddr = initialize_data_dependence_relation (dr_a,
748 dr_b, vNULL);
749 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
750 free_dependence_relation (ddr);
752 if (dependent)
753 return false;
757 else /* DR_IS_READ */
759 stmt_vec_info first_access_info
760 = vect_find_first_scalar_stmt_in_slp (node);
761 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
763 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
764 if (access_info == first_access_info)
765 continue;
766 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
767 ao_ref ref;
768 bool ref_initialized_p = false;
769 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
770 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
772 gimple *stmt = gsi_stmt (gsi);
773 if (! gimple_vdef (stmt))
774 continue;
776 /* If we couldn't record a (single) data reference for this
777 stmt we have to resort to the alias oracle. */
778 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
779 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
780 if (!dr_b)
782 /* We are hoisting a load - this means we can use
783 TBAA for disambiguation. */
784 if (!ref_initialized_p)
785 ao_ref_init (&ref, DR_REF (dr_a));
786 if (stmt_may_clobber_ref_p_1 (stmt, &ref, true))
787 return false;
788 continue;
791 bool dependent = false;
792 /* If we run into a store of this same instance (we've just
793 marked those) then delay dependence checking until we run
794 into the last store because this is where it will have
795 been sunk to (and we verify if we can do that as well). */
796 if (gimple_visited_p (stmt))
798 if (stmt_info != last_store_info)
799 continue;
800 unsigned i;
801 stmt_vec_info store_info;
802 FOR_EACH_VEC_ELT (stores, i, store_info)
804 data_reference *store_dr
805 = STMT_VINFO_DATA_REF (store_info);
806 ddr_p ddr = initialize_data_dependence_relation
807 (dr_a, store_dr, vNULL);
808 dependent
809 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
810 free_dependence_relation (ddr);
811 if (dependent)
812 break;
815 else
817 ddr_p ddr = initialize_data_dependence_relation (dr_a,
818 dr_b, vNULL);
819 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
820 free_dependence_relation (ddr);
822 if (dependent)
823 return false;
827 return true;
831 /* Function vect_slp_analyze_instance_dependence.
833 Examine all the data references in SLP instance INSTANCE, and make sure
834 there do not exist any data dependences between them that would prevent
835 the instance from being vectorized. */
837 bool
838 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
840 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
842 /* The stores of this instance are at the root of the SLP tree. */
843 slp_tree store = SLP_INSTANCE_TREE (instance);
844 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
845 store = NULL;
847 /* Verify we can sink stores to the vectorized stmt insert location. */
848 stmt_vec_info last_store_info = NULL;
849 if (store)
851 if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
852 return false;
854 /* Mark stores in this instance and remember the last one. */
855 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
856 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
857 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
860 bool res = true;
862 /* Verify we can sink loads to the vectorized stmt insert location,
863 special-casing stores of this instance. */
864 slp_tree load;
865 unsigned int i;
866 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
867 if (! vect_slp_analyze_node_dependences (vinfo, load,
868 store
869 ? SLP_TREE_SCALAR_STMTS (store)
870 : vNULL, last_store_info))
872 res = false;
873 break;
876 /* Unset the visited flag. */
877 if (store)
878 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
879 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
881 return res;
884 /* Record the base alignment guarantee given by DRB, which occurs
885 in STMT_INFO. */
887 static void
888 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
889 innermost_loop_behavior *drb)
891 bool existed;
892 innermost_loop_behavior *&entry
893 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
894 if (!existed || entry->base_alignment < drb->base_alignment)
896 entry = drb;
897 if (dump_enabled_p ())
898 dump_printf_loc (MSG_NOTE, vect_location,
899 "recording new base alignment for %T\n"
900 " alignment: %d\n"
901 " misalignment: %d\n"
902 " based on: %G",
903 drb->base_address,
904 drb->base_alignment,
905 drb->base_misalignment,
906 stmt_info->stmt);
910 /* If the region we're going to vectorize is reached, all unconditional
911 data references occur at least once. We can therefore pool the base
912 alignment guarantees from each unconditional reference. Do this by
913 going through all the data references in VINFO and checking whether
914 the containing statement makes the reference unconditionally. If so,
915 record the alignment of the base address in VINFO so that it can be
916 used for all other references with the same base. */
918 void
919 vect_record_base_alignments (vec_info *vinfo)
921 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
922 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
923 data_reference *dr;
924 unsigned int i;
925 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
927 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
928 stmt_vec_info stmt_info = dr_info->stmt;
929 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
930 && STMT_VINFO_VECTORIZABLE (stmt_info)
931 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
933 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
935 /* If DR is nested in the loop that is being vectorized, we can also
936 record the alignment of the base wrt the outer loop. */
937 if (loop && nested_in_vect_loop_p (loop, stmt_info))
938 vect_record_base_alignment
939 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
944 /* Return the target alignment for the vectorized form of DR_INFO. */
946 static poly_uint64
947 vect_calculate_target_alignment (dr_vec_info *dr_info)
949 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
950 return targetm.vectorize.preferred_vector_alignment (vectype);
953 /* Function vect_compute_data_ref_alignment
955 Compute the misalignment of the data reference DR_INFO.
957 Output:
958 1. DR_MISALIGNMENT (DR_INFO) is defined.
960 FOR NOW: No analysis is actually performed. Misalignment is calculated
961 only for trivial cases. TODO. */
963 static void
964 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info)
966 stmt_vec_info stmt_info = dr_info->stmt;
967 vec_base_alignments *base_alignments = &vinfo->base_alignments;
968 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
969 class loop *loop = NULL;
970 tree ref = DR_REF (dr_info->dr);
971 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
973 if (dump_enabled_p ())
974 dump_printf_loc (MSG_NOTE, vect_location,
975 "vect_compute_data_ref_alignment:\n");
977 if (loop_vinfo)
978 loop = LOOP_VINFO_LOOP (loop_vinfo);
980 /* Initialize misalignment to unknown. */
981 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
983 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
984 return;
986 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
987 bool step_preserves_misalignment_p;
989 poly_uint64 vector_alignment
990 = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
991 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
993 /* If the main loop has peeled for alignment we have no way of knowing
994 whether the data accesses in the epilogues are aligned. We can't at
995 compile time answer the question whether we have entered the main loop or
996 not. Fixes PR 92351. */
997 if (loop_vinfo)
999 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1000 if (orig_loop_vinfo
1001 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1002 return;
1005 unsigned HOST_WIDE_INT vect_align_c;
1006 if (!vector_alignment.is_constant (&vect_align_c))
1007 return;
1009 /* No step for BB vectorization. */
1010 if (!loop)
1012 gcc_assert (integer_zerop (drb->step));
1013 step_preserves_misalignment_p = true;
1016 /* In case the dataref is in an inner-loop of the loop that is being
1017 vectorized (LOOP), we use the base and misalignment information
1018 relative to the outer-loop (LOOP). This is ok only if the misalignment
1019 stays the same throughout the execution of the inner-loop, which is why
1020 we have to check that the stride of the dataref in the inner-loop evenly
1021 divides by the vector alignment. */
1022 else if (nested_in_vect_loop_p (loop, stmt_info))
1024 step_preserves_misalignment_p
1025 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1027 if (dump_enabled_p ())
1029 if (step_preserves_misalignment_p)
1030 dump_printf_loc (MSG_NOTE, vect_location,
1031 "inner step divides the vector alignment.\n");
1032 else
1033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1034 "inner step doesn't divide the vector"
1035 " alignment.\n");
1039 /* Similarly we can only use base and misalignment information relative to
1040 an innermost loop if the misalignment stays the same throughout the
1041 execution of the loop. As above, this is the case if the stride of
1042 the dataref evenly divides by the alignment. */
1043 else
1045 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1046 step_preserves_misalignment_p
1047 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1049 if (!step_preserves_misalignment_p && dump_enabled_p ())
1050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1051 "step doesn't divide the vector alignment.\n");
1054 unsigned int base_alignment = drb->base_alignment;
1055 unsigned int base_misalignment = drb->base_misalignment;
1057 /* Calculate the maximum of the pooled base address alignment and the
1058 alignment that we can compute for DR itself. */
1059 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
1060 if (entry && base_alignment < (*entry)->base_alignment)
1062 base_alignment = (*entry)->base_alignment;
1063 base_misalignment = (*entry)->base_misalignment;
1066 if (drb->offset_alignment < vect_align_c
1067 || !step_preserves_misalignment_p
1068 /* We need to know whether the step wrt the vectorized loop is
1069 negative when computing the starting misalignment below. */
1070 || TREE_CODE (drb->step) != INTEGER_CST)
1072 if (dump_enabled_p ())
1073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1074 "Unknown alignment for access: %T\n", ref);
1075 return;
1078 if (base_alignment < vect_align_c)
1080 unsigned int max_alignment;
1081 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1082 if (max_alignment < vect_align_c
1083 || !vect_can_force_dr_alignment_p (base,
1084 vect_align_c * BITS_PER_UNIT))
1086 if (dump_enabled_p ())
1087 dump_printf_loc (MSG_NOTE, vect_location,
1088 "can't force alignment of ref: %T\n", ref);
1089 return;
1092 /* Force the alignment of the decl.
1093 NOTE: This is the only change to the code we make during
1094 the analysis phase, before deciding to vectorize the loop. */
1095 if (dump_enabled_p ())
1096 dump_printf_loc (MSG_NOTE, vect_location,
1097 "force alignment of %T\n", ref);
1099 dr_info->base_decl = base;
1100 dr_info->base_misaligned = true;
1101 base_misalignment = 0;
1103 poly_int64 misalignment
1104 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1106 /* If this is a backward running DR then first access in the larger
1107 vectype actually is N-1 elements before the address in the DR.
1108 Adjust misalign accordingly. */
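/* Worked example (hypothetical numbers): for a V4SI access (four 4-byte
   elements) with step -4, the vector load that covers this scalar access
   starts (4 - 1) * 4 = 12 bytes below the address in the DR, so the
   adjustment below adds (4 - 1) * -4 = -12 to the misalignment.  */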
1109 if (tree_int_cst_sgn (drb->step) < 0)
1110 /* PLUS because STEP is negative. */
1111 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1112 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1114 unsigned int const_misalignment;
1115 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1117 if (dump_enabled_p ())
1118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1119 "Non-constant misalignment for access: %T\n", ref);
1120 return;
1123 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1125 if (dump_enabled_p ())
1126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1127 "misalign = %d bytes of ref %T\n",
1128 DR_MISALIGNMENT (dr_info), ref);
1130 return;
1133 /* Function vect_update_misalignment_for_peel.
1134 Sets DR_INFO's misalignment
1135 - to 0 if it has the same alignment as DR_PEEL_INFO,
1136 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1137 - to -1 (unknown) otherwise.
1139 DR_INFO - the data reference whose misalignment is to be adjusted.
1140 DR_PEEL_INFO - the data reference whose misalignment is being made
1141 zero in the vector loop by the peel.
1142 NPEEL - the number of iterations in the peel loop if the misalignment
1143 of DR_PEEL_INFO is known at compile time. */
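/* Worked example (hypothetical numbers): with a target alignment of 16
   bytes, DR_STEP of 4 bytes and a known misalignment of 4 bytes, peeling
   NPEEL = 3 iterations advances DR_INFO by 12 bytes, so its new
   misalignment is (4 + 3 * 4) & (16 - 1) = 0.  */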
1145 static void
1146 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1147 dr_vec_info *dr_peel_info, int npeel)
1149 unsigned int i;
1150 vec<dr_p> same_aligned_drs;
1151 struct data_reference *current_dr;
1152 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1154 /* It can be assumed that if dr_info has the same alignment as dr_peel,
1155 it is aligned in the vector loop. */
1156 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1157 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1159 if (current_dr != dr_info->dr)
1160 continue;
1161 gcc_assert (!known_alignment_for_access_p (dr_info)
1162 || !known_alignment_for_access_p (dr_peel_info)
1163 || (DR_MISALIGNMENT (dr_info)
1164 == DR_MISALIGNMENT (dr_peel_info)));
1165 SET_DR_MISALIGNMENT (dr_info, 0);
1166 return;
1169 unsigned HOST_WIDE_INT alignment;
1170 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1171 && known_alignment_for_access_p (dr_info)
1172 && known_alignment_for_access_p (dr_peel_info))
1174 int misal = DR_MISALIGNMENT (dr_info);
1175 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1176 misal &= alignment - 1;
1177 SET_DR_MISALIGNMENT (dr_info, misal);
1178 return;
1181 if (dump_enabled_p ())
1182 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1183 "to unknown (-1).\n");
1184 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1187 /* Return true if alignment is relevant for DR_INFO. */
1189 static bool
1190 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1192 stmt_vec_info stmt_info = dr_info->stmt;
1194 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1195 return false;
1197 /* For interleaving, only the alignment of the first access matters. */
1198 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1199 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1200 return false;
1202 /* Scatter-gather and invariant accesses continue to address individual
1203 scalars, so vector-level alignment is irrelevant. */
1204 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1205 || integer_zerop (DR_STEP (dr_info->dr)))
1206 return false;
1208 /* Strided accesses perform only component accesses, alignment is
1209 irrelevant for them. */
1210 if (STMT_VINFO_STRIDED_P (stmt_info)
1211 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1212 return false;
1214 return true;
1217 /* Given a memory reference EXP, return whether its alignment is less
1218 than its size. */
1220 static bool
1221 not_size_aligned (tree exp)
1223 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1224 return true;
1226 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1227 > get_object_alignment (exp));
1230 /* Function vector_alignment_reachable_p
1232 Return true if vector alignment for DR_INFO is reachable by peeling
1233 a few loop iterations. Return false otherwise. */
1235 static bool
1236 vector_alignment_reachable_p (dr_vec_info *dr_info)
1238 stmt_vec_info stmt_info = dr_info->stmt;
1239 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1241 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1243 /* For interleaved access we peel only if number of iterations in
1244 the prolog loop ({VF - misalignment}), is a multiple of the
1245 number of the interleaved accesses. */
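/* Worked example (hypothetical numbers): with 8 elements per vector and a
   misalignment of 2 elements, the prologue peels 8 - 2 = 6 scalar
   iterations; that is fine for a group size of 3 (6 % 3 == 0) but not for
   a group size of 4, in which case we give up below.  */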
1246 int elem_size, mis_in_elements;
1248 /* FORNOW: handle only known alignment. */
1249 if (!known_alignment_for_access_p (dr_info))
1250 return false;
1252 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1253 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1254 elem_size = vector_element_size (vector_size, nelements);
1255 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1257 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1258 return false;
1261 /* If misalignment is known at the compile time then allow peeling
1262 only if natural alignment is reachable through peeling. */
1263 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1265 HOST_WIDE_INT elmsize =
1266 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1267 if (dump_enabled_p ())
1269 dump_printf_loc (MSG_NOTE, vect_location,
1270 "data size = %wd. misalignment = %d.\n", elmsize,
1271 DR_MISALIGNMENT (dr_info));
1273 if (DR_MISALIGNMENT (dr_info) % elmsize)
1275 if (dump_enabled_p ())
1276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1277 "data size does not divide the misalignment.\n");
1278 return false;
1282 if (!known_alignment_for_access_p (dr_info))
1284 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1285 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1286 if (dump_enabled_p ())
1287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1288 "Unknown misalignment, %snaturally aligned\n",
1289 is_packed ? "not " : "");
1290 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1293 return true;
1297 /* Calculate the cost of the memory access represented by DR_INFO. */
1299 static void
1300 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1301 unsigned int *inside_cost,
1302 unsigned int *outside_cost,
1303 stmt_vector_for_cost *body_cost_vec,
1304 stmt_vector_for_cost *prologue_cost_vec)
1306 stmt_vec_info stmt_info = dr_info->stmt;
1307 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1308 int ncopies;
1310 if (PURE_SLP_STMT (stmt_info))
1311 ncopies = 1;
1312 else
1313 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1315 if (DR_IS_READ (dr_info->dr))
1316 vect_get_load_cost (vinfo, stmt_info, ncopies, true, inside_cost,
1317 outside_cost, prologue_cost_vec, body_cost_vec, false);
1318 else
1319 vect_get_store_cost (vinfo, stmt_info, ncopies, inside_cost, body_cost_vec);
1321 if (dump_enabled_p ())
1322 dump_printf_loc (MSG_NOTE, vect_location,
1323 "vect_get_data_access_cost: inside_cost = %d, "
1324 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1328 typedef struct _vect_peel_info
1330 dr_vec_info *dr_info;
1331 int npeel;
1332 unsigned int count;
1333 } *vect_peel_info;
1335 typedef struct _vect_peel_extended_info
1337 vec_info *vinfo;
1338 struct _vect_peel_info peel_info;
1339 unsigned int inside_cost;
1340 unsigned int outside_cost;
1341 } *vect_peel_extended_info;
1344 /* Peeling hashtable helpers. */
1346 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1348 static inline hashval_t hash (const _vect_peel_info *);
1349 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1352 inline hashval_t
1353 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1355 return (hashval_t) peel_info->npeel;
1358 inline bool
1359 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1361 return (a->npeel == b->npeel);
1365 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1367 static void
1368 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1369 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1370 int npeel)
1372 struct _vect_peel_info elem, *slot;
1373 _vect_peel_info **new_slot;
1374 bool supportable_dr_alignment
1375 = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
1377 elem.npeel = npeel;
1378 slot = peeling_htab->find (&elem);
1379 if (slot)
1380 slot->count++;
1381 else
1383 slot = XNEW (struct _vect_peel_info);
1384 slot->npeel = npeel;
1385 slot->dr_info = dr_info;
1386 slot->count = 1;
1387 new_slot = peeling_htab->find_slot (slot, INSERT);
1388 *new_slot = slot;
1391 if (!supportable_dr_alignment
1392 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1393 slot->count += VECT_MAX_COST;
1397 /* Traverse peeling hash table to find peeling option that aligns maximum
1398 number of data accesses. */
1400 int
1401 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1402 _vect_peel_extended_info *max)
1404 vect_peel_info elem = *slot;
1406 if (elem->count > max->peel_info.count
1407 || (elem->count == max->peel_info.count
1408 && max->peel_info.npeel > elem->npeel))
1410 max->peel_info.npeel = elem->npeel;
1411 max->peel_info.count = elem->count;
1412 max->peel_info.dr_info = elem->dr_info;
1415 return 1;
1418 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1419 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1420 we assume DR0_INFO's misalignment will be zero after peeling. */
1422 static void
1423 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1424 dr_vec_info *dr0_info,
1425 unsigned int *inside_cost,
1426 unsigned int *outside_cost,
1427 stmt_vector_for_cost *body_cost_vec,
1428 stmt_vector_for_cost *prologue_cost_vec,
1429 unsigned int npeel,
1430 bool unknown_misalignment)
1432 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1433 unsigned i;
1434 data_reference *dr;
1436 FOR_EACH_VEC_ELT (datarefs, i, dr)
1438 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1439 if (!vect_relevant_for_alignment_p (dr_info))
1440 continue;
1442 int save_misalignment;
1443 save_misalignment = DR_MISALIGNMENT (dr_info);
1444 if (npeel == 0)
1446 else if (unknown_misalignment && dr_info == dr0_info)
1447 SET_DR_MISALIGNMENT (dr_info, 0);
1448 else
1449 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1450 vect_get_data_access_cost (loop_vinfo, dr_info, inside_cost, outside_cost,
1451 body_cost_vec, prologue_cost_vec);
1452 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1456 /* Traverse peeling hash table and calculate cost for each peeling option.
1457 Find the one with the lowest cost. */
1459 int
1460 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1461 _vect_peel_extended_info *min)
1463 vect_peel_info elem = *slot;
1464 int dummy;
1465 unsigned int inside_cost = 0, outside_cost = 0;
1466 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1467 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1468 epilogue_cost_vec;
1470 prologue_cost_vec.create (2);
1471 body_cost_vec.create (2);
1472 epilogue_cost_vec.create (2);
1474 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1475 &outside_cost, &body_cost_vec,
1476 &prologue_cost_vec, elem->npeel, false);
1478 body_cost_vec.release ();
1480 outside_cost += vect_get_known_peeling_cost
1481 (loop_vinfo, elem->npeel, &dummy,
1482 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1483 &prologue_cost_vec, &epilogue_cost_vec);
1485 /* Prologue and epilogue costs are added to the target model later.
1486 These costs depend only on the scalar iteration cost, the
1487 number of peeling iterations finally chosen, and the number of
1488 misaligned statements. So discard the information found here. */
1489 prologue_cost_vec.release ();
1490 epilogue_cost_vec.release ();
1492 if (inside_cost < min->inside_cost
1493 || (inside_cost == min->inside_cost
1494 && outside_cost < min->outside_cost))
1496 min->inside_cost = inside_cost;
1497 min->outside_cost = outside_cost;
1498 min->peel_info.dr_info = elem->dr_info;
1499 min->peel_info.npeel = elem->npeel;
1500 min->peel_info.count = elem->count;
1503 return 1;
1507 /* Choose best peeling option by traversing peeling hash table and either
1508 choosing an option with the lowest cost (if cost model is enabled) or the
1509 option that aligns as many accesses as possible. */
1511 static struct _vect_peel_extended_info
1512 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1513 loop_vec_info loop_vinfo)
1515 struct _vect_peel_extended_info res;
1517 res.peel_info.dr_info = NULL;
1518 res.vinfo = loop_vinfo;
1520 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1522 res.inside_cost = INT_MAX;
1523 res.outside_cost = INT_MAX;
1524 peeling_htab->traverse <_vect_peel_extended_info *,
1525 vect_peeling_hash_get_lowest_cost> (&res);
1527 else
1529 res.peel_info.count = 0;
1530 peeling_htab->traverse <_vect_peel_extended_info *,
1531 vect_peeling_hash_get_most_frequent> (&res);
1532 res.inside_cost = 0;
1533 res.outside_cost = 0;
1536 return res;
1539 /* Return true if the new peeling NPEEL is supported. */
1541 static bool
1542 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1543 unsigned npeel)
1545 unsigned i;
1546 struct data_reference *dr = NULL;
1547 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1548 enum dr_alignment_support supportable_dr_alignment;
1550 /* Ensure that all data refs can be vectorized after the peel. */
1551 FOR_EACH_VEC_ELT (datarefs, i, dr)
1553 int save_misalignment;
1555 if (dr == dr0_info->dr)
1556 continue;
1558 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1559 if (!vect_relevant_for_alignment_p (dr_info))
1560 continue;
1562 save_misalignment = DR_MISALIGNMENT (dr_info);
1563 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1564 supportable_dr_alignment
1565 = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
1566 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1568 if (!supportable_dr_alignment)
1569 return false;
1572 return true;
1575 /* Function vect_enhance_data_refs_alignment
1577 This pass will use loop versioning and loop peeling in order to enhance
1578 the alignment of data references in the loop.
1580 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1581 original loop is to be vectorized. Any other loops that are created by
1582 the transformations performed in this pass are not supposed to be
1583 vectorized. This restriction will be relaxed.
1585 This pass will require a cost model to guide it whether to apply peeling
1586 or versioning or a combination of the two. For example, the scheme that
1587 Intel uses when given a loop with several memory accesses is as follows:
1588 choose one memory access ('p') whose alignment you want to force by doing
1589 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1590 other accesses are not necessarily aligned, or (2) use loop versioning to
1591 generate one loop in which all accesses are aligned, and another loop in
1592 which only 'p' is necessarily aligned.
1594 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1595 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1596 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1598 Devising a cost model is the most critical aspect of this work. It will
1599 guide us on which access to peel for, whether to use loop versioning, how
1600 many versions to create, etc. The cost model will probably consist of
1601 generic considerations as well as target specific considerations (on
1602 powerpc for example, misaligned stores are more painful than misaligned
1603 loads).
1605 Here are the general steps involved in alignment enhancements:
1607 -- original loop, before alignment analysis:
1608 for (i=0; i<N; i++){
1609 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1610 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1613 -- After vect_compute_data_refs_alignment:
1614 for (i=0; i<N; i++){
1615 x = q[i]; # DR_MISALIGNMENT(q) = 3
1616 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1619 -- Possibility 1: we do loop versioning:
1620 if (p is aligned) {
1621 for (i=0; i<N; i++){ # loop 1A
1622 x = q[i]; # DR_MISALIGNMENT(q) = 3
1623 p[i] = y; # DR_MISALIGNMENT(p) = 0
1626 else {
1627 for (i=0; i<N; i++){ # loop 1B
1628 x = q[i]; # DR_MISALIGNMENT(q) = 3
1629 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1633 -- Possibility 2: we do loop peeling:
1634 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1635 x = q[i];
1636 p[i] = y;
1638 for (i = 3; i < N; i++){ # loop 2A
1639 x = q[i]; # DR_MISALIGNMENT(q) = 0
1640 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1643 -- Possibility 3: combination of loop peeling and versioning:
1644 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1645 x = q[i];
1646 p[i] = y;
1648 if (p is aligned) {
1649 for (i = 3; i<N; i++){ # loop 3A
1650 x = q[i]; # DR_MISALIGNMENT(q) = 0
1651 p[i] = y; # DR_MISALIGNMENT(p) = 0
1654 else {
1655 for (i = 3; i<N; i++){ # loop 3B
1656 x = q[i]; # DR_MISALIGNMENT(q) = 0
1657 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1661 These loops are later passed to loop_transform to be vectorized. The
1662 vectorizer will use the alignment information to guide the transformation
1663 (whether to generate regular loads/stores, or with special handling for
1664 misalignment). */
1666 opt_result
1667 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1669 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1670 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1671 enum dr_alignment_support supportable_dr_alignment;
1672 dr_vec_info *first_store = NULL;
1673 dr_vec_info *dr0_info = NULL;
1674 struct data_reference *dr;
1675 unsigned int i;
1676 bool do_peeling = false;
1677 bool do_versioning = false;
1678 unsigned int npeel = 0;
1679 bool one_misalignment_known = false;
1680 bool one_misalignment_unknown = false;
1681 bool one_dr_unsupportable = false;
1682 dr_vec_info *unsupportable_dr_info = NULL;
1683 unsigned int mis, same_align_drs_max = 0;
1684 hash_table<peel_info_hasher> peeling_htab (1);
1686 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1688 /* Reset data so we can safely be called multiple times. */
1689 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1690 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1692 /* While cost model enhancements are expected in the future, the high level
1693 view of the code at this time is as follows:
1695 A) If there is a misaligned access then see if peeling to align
1696 this access can make all data references satisfy
1697 vect_supportable_dr_alignment. If so, update data structures
1698 as needed and return true.
1700 B) If peeling wasn't possible and there is a data reference with an
1701 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1702 then see if loop versioning checks can be used to make all data
1703 references satisfy vect_supportable_dr_alignment. If so, update
1704 data structures as needed and return true.
1706 C) If neither peeling nor versioning were successful then return false if
1707 any data reference does not satisfy vect_supportable_dr_alignment.
1709 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1711 Note, Possibility 3 above (which is peeling and versioning together) is not
1712 being done at this time. */
1714 /* (1) Peeling to force alignment. */
1716 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1717 Considerations:
1718 + How many accesses will become aligned due to the peeling
1719 - How many accesses will become unaligned due to the peeling,
1720 and the cost of misaligned accesses.
1721 - The cost of peeling (the extra runtime checks, the increase
1722 in code size). */
1724 FOR_EACH_VEC_ELT (datarefs, i, dr)
1726 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1727 if (!vect_relevant_for_alignment_p (dr_info))
1728 continue;
1730 stmt_vec_info stmt_info = dr_info->stmt;
1731 supportable_dr_alignment
1732 = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
1733 do_peeling = vector_alignment_reachable_p (dr_info);
1734 if (do_peeling)
1736 if (known_alignment_for_access_p (dr_info))
1738 unsigned int npeel_tmp = 0;
1739 bool negative = tree_int_cst_compare (DR_STEP (dr),
1740 size_zero_node) < 0;
1742 /* If known_alignment_for_access_p then we have set
1743 DR_MISALIGNMENT which is only done if we know it at compile
1744 time, so it is safe to assume target alignment is constant.
1746 unsigned int target_align =
1747 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1748 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1749 mis = (negative
1750 ? DR_MISALIGNMENT (dr_info)
1751 : -DR_MISALIGNMENT (dr_info));
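/* For example, with a positive step, a target alignment of 16 bytes,
   4-byte elements and a misalignment of 12 bytes, the computation
   below gives npeel_tmp == ((-12) & 15) / 4 == 1, i.e. one scalar
   iteration reaches the next 16-byte boundary.  */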
1752 if (DR_MISALIGNMENT (dr_info) != 0)
1753 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1755 /* For multiple types, it is possible that the bigger type access
1756 will have more than one peeling option. E.g., a loop with two
1757 types: one of size (vector size / 4), and the other one of
1758 size (vector size / 8). Vectorization factor will be 8. If both
1759 accesses are misaligned by 3, the first one needs one scalar
1760 iteration to be aligned, and the second one needs 5. But the
1761 first one will be aligned also by peeling 5 scalar
1762 iterations, and in that case both accesses will be aligned.
1763 Hence, except for the immediate peeling amount, we also want
1764 to try adding the full vector size, as long as we don't exceed the
1765 vectorization factor.
1766 We do this automatically for the cost model, since we calculate
1767 cost for every peeling option. */
1768 poly_uint64 nscalars = npeel_tmp;
1769 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1771 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1772 nscalars = (STMT_SLP_TYPE (stmt_info)
1773 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1776 /* Save info about DR in the hash table. Also include peeling
1777 amounts according to the explanation above. */
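/* For instance, if npeel_tmp starts at 1, target_align / dr_size
   is 4 and nscalars is 8, this records the candidate peeling
   amounts 1 and 5.  */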
1778 while (known_le (npeel_tmp, nscalars))
1780 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1781 dr_info, npeel_tmp);
1782 npeel_tmp += MAX (1, target_align / dr_size);
1785 one_misalignment_known = true;
1787 else
1789 /* If we don't know any misalignment values, we prefer
1790 peeling for the data-ref that has the maximum number of data-refs
1791 with the same alignment, unless the target prefers to align
1792 stores over loads. */
1793 unsigned same_align_drs
1794 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1795 if (!dr0_info
1796 || same_align_drs_max < same_align_drs)
1798 same_align_drs_max = same_align_drs;
1799 dr0_info = dr_info;
1801 /* For data-refs with the same number of related
1802 accesses prefer the one where the misalign
1803 computation will be invariant in the outermost loop. */
1804 else if (same_align_drs_max == same_align_drs)
1806 class loop *ivloop0, *ivloop;
1807 ivloop0 = outermost_invariant_loop_for_expr
1808 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1809 ivloop = outermost_invariant_loop_for_expr
1810 (loop, DR_BASE_ADDRESS (dr));
1811 if ((ivloop && !ivloop0)
1812 || (ivloop && ivloop0
1813 && flow_loop_nested_p (ivloop, ivloop0)))
1814 dr0_info = dr_info;
1817 one_misalignment_unknown = true;
1819 /* Check for data refs with unsupportable alignment that
1820 can be peeled. */
1821 if (!supportable_dr_alignment)
1823 one_dr_unsupportable = true;
1824 unsupportable_dr_info = dr_info;
1827 if (!first_store && DR_IS_WRITE (dr))
1828 first_store = dr_info;
1831 else
1833 if (!aligned_access_p (dr_info))
1835 if (dump_enabled_p ())
1836 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1837 "vector alignment may not be reachable\n");
1838 break;
1843 /* Check if we can possibly peel the loop. */
1844 if (!vect_can_advance_ivs_p (loop_vinfo)
1845 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1846 || loop->inner)
1847 do_peeling = false;
1849 struct _vect_peel_extended_info peel_for_known_alignment;
1850 struct _vect_peel_extended_info peel_for_unknown_alignment;
1851 struct _vect_peel_extended_info best_peel;
1853 peel_for_unknown_alignment.inside_cost = INT_MAX;
1854 peel_for_unknown_alignment.outside_cost = INT_MAX;
1855 peel_for_unknown_alignment.peel_info.count = 0;
1857 if (do_peeling
1858 && one_misalignment_unknown)
1860 /* Check if the target requires to prefer stores over loads, i.e., if
1861 misaligned stores are more expensive than misaligned loads (taking
1862 drs with same alignment into account). */
1863 unsigned int load_inside_cost = 0;
1864 unsigned int load_outside_cost = 0;
1865 unsigned int store_inside_cost = 0;
1866 unsigned int store_outside_cost = 0;
1867 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1869 stmt_vector_for_cost dummy;
1870 dummy.create (2);
1871 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1872 &load_inside_cost,
1873 &load_outside_cost,
1874 &dummy, &dummy, estimated_npeels, true);
1875 dummy.release ();
1877 if (first_store)
1879 dummy.create (2);
1880 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1881 &store_inside_cost,
1882 &store_outside_cost,
1883 &dummy, &dummy,
1884 estimated_npeels, true);
1885 dummy.release ();
1887 else
1889 store_inside_cost = INT_MAX;
1890 store_outside_cost = INT_MAX;
1893 if (load_inside_cost > store_inside_cost
1894 || (load_inside_cost == store_inside_cost
1895 && load_outside_cost > store_outside_cost))
1897 dr0_info = first_store;
1898 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1899 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1901 else
1903 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1904 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1907 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1908 prologue_cost_vec.create (2);
1909 epilogue_cost_vec.create (2);
1911 int dummy2;
1912 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1913 (loop_vinfo, estimated_npeels, &dummy2,
1914 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1915 &prologue_cost_vec, &epilogue_cost_vec);
1917 prologue_cost_vec.release ();
1918 epilogue_cost_vec.release ();
1920 peel_for_unknown_alignment.peel_info.count = 1
1921 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1924 peel_for_unknown_alignment.peel_info.npeel = 0;
1925 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1927 best_peel = peel_for_unknown_alignment;
1929 peel_for_known_alignment.inside_cost = INT_MAX;
1930 peel_for_known_alignment.outside_cost = INT_MAX;
1931 peel_for_known_alignment.peel_info.count = 0;
1932 peel_for_known_alignment.peel_info.dr_info = NULL;
1934 if (do_peeling && one_misalignment_known)
1936 /* Peeling is possible, but there is no data access that is not supported
1937 unless aligned. So we try to choose the best possible peeling from
1938 the hash table. */
1939 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1940 (&peeling_htab, loop_vinfo);
1943 /* Compare costs of peeling for known and unknown alignment. */
1944 if (peel_for_known_alignment.peel_info.dr_info != NULL
1945 && peel_for_unknown_alignment.inside_cost
1946 >= peel_for_known_alignment.inside_cost)
1948 best_peel = peel_for_known_alignment;
1950 /* If the best peeling for known alignment has NPEEL == 0, perform no
1951 peeling at all except if there is an unsupportable dr that we can
1952 align. */
1953 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1954 do_peeling = false;
1957 /* If there is an unsupportable data ref, prefer this over all choices so far
1958 since we'd have to discard a chosen peeling except when it accidentally
1959 aligned the unsupportable data ref. */
1960 if (one_dr_unsupportable)
1961 dr0_info = unsupportable_dr_info;
1962 else if (do_peeling)
1964 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1965 TODO: Use nopeel_outside_cost or get rid of it? */
1966 unsigned nopeel_inside_cost = 0;
1967 unsigned nopeel_outside_cost = 0;
1969 stmt_vector_for_cost dummy;
1970 dummy.create (2);
1971 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
1972 &nopeel_outside_cost, &dummy, &dummy,
1973 0, false);
1974 dummy.release ();
1976 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1977 costs will be recorded. */
1978 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1979 prologue_cost_vec.create (2);
1980 epilogue_cost_vec.create (2);
1982 int dummy2;
1983 nopeel_outside_cost += vect_get_known_peeling_cost
1984 (loop_vinfo, 0, &dummy2,
1985 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1986 &prologue_cost_vec, &epilogue_cost_vec);
1988 prologue_cost_vec.release ();
1989 epilogue_cost_vec.release ();
1991 npeel = best_peel.peel_info.npeel;
1992 dr0_info = best_peel.peel_info.dr_info;
1994 /* If doing no peeling is no more expensive than the best peeling we
1995 have found so far, don't perform any peeling. */
1996 if (nopeel_inside_cost <= best_peel.inside_cost)
1997 do_peeling = false;
2000 if (do_peeling)
2002 stmt_vec_info stmt_info = dr0_info->stmt;
2003 if (known_alignment_for_access_p (dr0_info))
2005 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2006 size_zero_node) < 0;
2007 if (!npeel)
2009 /* Since it's known at compile time, compute the number of
2010 iterations in the peeled loop (the peeling factor) for use in
2011 updating DR_MISALIGNMENT values. The peeling factor is the
2012 vectorization factor minus the misalignment as an element
2013 count. */
2014 mis = (negative
2015 ? DR_MISALIGNMENT (dr0_info)
2016 : -DR_MISALIGNMENT (dr0_info));
2017 /* If known_alignment_for_access_p then we have set
2018 DR_MISALIGNMENT which is only done if we know it at compile
2019 time, so it is safe to assume target alignment is constant.
2021 unsigned int target_align =
2022 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2023 npeel = ((mis & (target_align - 1))
2024 / vect_get_scalar_dr_size (dr0_info));
2027 /* For interleaved data access every iteration accesses all the
2028 members of the group, therefore we divide the number of iterations
2029 by the group size. */
2030 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2031 npeel /= DR_GROUP_SIZE (stmt_info);
2033 if (dump_enabled_p ())
2034 dump_printf_loc (MSG_NOTE, vect_location,
2035 "Try peeling by %d\n", npeel);
2038 /* Ensure that all datarefs can be vectorized after the peel. */
2039 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2040 do_peeling = false;
2042 /* Check if all datarefs are supportable and log. */
2043 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2044 return opt_result::success ();
2046 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2047 if (do_peeling)
2049 unsigned max_allowed_peel
2050 = param_vect_max_peeling_for_alignment;
2051 if (flag_vect_cost_model == VECT_COST_MODEL_CHEAP)
2052 max_allowed_peel = 0;
2053 if (max_allowed_peel != (unsigned)-1)
2055 unsigned max_peel = npeel;
2056 if (max_peel == 0)
2058 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2059 unsigned HOST_WIDE_INT target_align_c;
2060 if (target_align.is_constant (&target_align_c))
2061 max_peel =
2062 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2063 else
2065 do_peeling = false;
2066 if (dump_enabled_p ())
2067 dump_printf_loc (MSG_NOTE, vect_location,
2068 "Disable peeling, max peels set and vector"
2069 " alignment unknown\n");
2072 if (max_peel > max_allowed_peel)
2074 do_peeling = false;
2075 if (dump_enabled_p ())
2076 dump_printf_loc (MSG_NOTE, vect_location,
2077 "Disable peeling, max peels reached: %d\n", max_peel);
2082 /* Cost model #2 - if peeling may result in a remaining loop not
2083 iterating enough to be vectorized then do not peel. Since this
2084 is a cost heuristic rather than a correctness decision, use the
2085 most likely runtime value for variable vectorization factors. */
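/* For example, with an assumed VF of 8 and an unknown peeling amount
   (npeel == 0) the check below requires at least 8 + 7 = 15 known
   iterations, otherwise peeling is abandoned.  */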
2086 if (do_peeling
2087 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2089 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2090 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2091 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2092 < assumed_vf + max_peel)
2093 do_peeling = false;
2096 if (do_peeling)
2098 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2099 If the misalignment of DR_i is identical to that of dr0 then set
2100 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2101 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2102 by the peeling factor times the element size of DR_i (MOD the
2103 vectorization factor times the size). Otherwise, the
2104 misalignment of DR_i must be set to unknown. */
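/* For example, if dr0 was misaligned by 12 bytes and one 4-byte
   iteration is peeled, dr0 becomes aligned, a DR_i with a known
   misalignment of 4 bytes becomes misaligned by 4 + 1 * 4 == 8,
   and a DR_i with unknown misalignment stays unknown.  */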
2105 FOR_EACH_VEC_ELT (datarefs, i, dr)
2106 if (dr != dr0_info->dr)
2108 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2109 if (!vect_relevant_for_alignment_p (dr_info))
2110 continue;
2112 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2115 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2116 if (npeel)
2117 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2118 else
2119 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2120 = DR_MISALIGNMENT (dr0_info);
2121 SET_DR_MISALIGNMENT (dr0_info, 0);
2122 if (dump_enabled_p ())
2124 dump_printf_loc (MSG_NOTE, vect_location,
2125 "Alignment of access forced using peeling.\n");
2126 dump_printf_loc (MSG_NOTE, vect_location,
2127 "Peeling for alignment will be applied.\n");
2130 /* The inside-loop cost will be accounted for in vectorizable_load
2131 and vectorizable_store correctly with adjusted alignments.
2132 Drop the body_cst_vec on the floor here. */
2133 return opt_result::success ();
2137 /* (2) Versioning to force alignment. */
2139 /* Try versioning if:
2140 1) the loop is optimized for speed and the cost model is not cheap,
2141 2) there is at least one unsupported misaligned data ref with an unknown
2142 misalignment, and
2143 3) all misaligned data refs with a known misalignment are supported, and
2144 4) the number of runtime alignment checks is within reason. */
2146 do_versioning
2147 = (optimize_loop_nest_for_speed_p (loop)
2148 && !loop->inner /* FORNOW */
2149 && flag_vect_cost_model != VECT_COST_MODEL_CHEAP);
2151 if (do_versioning)
2153 FOR_EACH_VEC_ELT (datarefs, i, dr)
2155 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2156 if (aligned_access_p (dr_info)
2157 || !vect_relevant_for_alignment_p (dr_info))
2158 continue;
2160 stmt_vec_info stmt_info = dr_info->stmt;
2161 if (STMT_VINFO_STRIDED_P (stmt_info))
2163 do_versioning = false;
2164 break;
2167 supportable_dr_alignment
2168 = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
2170 if (!supportable_dr_alignment)
2172 int mask;
2173 tree vectype;
2175 if (known_alignment_for_access_p (dr_info)
2176 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2177 >= (unsigned) param_vect_max_version_for_alignment_checks)
2179 do_versioning = false;
2180 break;
2183 vectype = STMT_VINFO_VECTYPE (stmt_info);
2184 gcc_assert (vectype);
2186 /* At present we don't support versioning for alignment
2187 with variable VF, since there's no guarantee that the
2188 VF is a power of two. We could relax this if we added
2189 a way of enforcing a power-of-two size. */
2190 unsigned HOST_WIDE_INT size;
2191 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2193 do_versioning = false;
2194 break;
2197 /* Forcing alignment in the first iteration is no good if
2198 we don't keep it across iterations. For now, just disable
2199 versioning in this case.
2200 ?? We could actually unroll the loop to achieve the required
2201 overall step alignment, and forcing the alignment could be
2202 done by doing some iterations of the non-vectorized loop. */
2203 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2204 * DR_STEP_ALIGNMENT (dr),
2205 DR_TARGET_ALIGNMENT (dr_info)))
2207 do_versioning = false;
2208 break;
2211 /* The rightmost bits of an aligned address must be zeros.
2212 Construct the mask needed for this test. For example,
2213 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2214 mask must be 15 = 0xf. */
2215 mask = size - 1;
2217 /* FORNOW: use the same mask to test all potentially unaligned
2218 references in the loop. */
2219 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2220 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2222 do_versioning = false;
2223 break;
2226 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2227 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2231 /* Versioning requires at least one misaligned data reference. */
2232 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2233 do_versioning = false;
2234 else if (!do_versioning)
2235 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2238 if (do_versioning)
2240 vec<stmt_vec_info> may_misalign_stmts
2241 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2242 stmt_vec_info stmt_info;
2244 /* It can now be assumed that the data references in the statements
2245 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2246 of the loop being vectorized. */
2247 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2249 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2250 SET_DR_MISALIGNMENT (dr_info, 0);
2251 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_NOTE, vect_location,
2253 "Alignment of access forced using versioning.\n");
2256 if (dump_enabled_p ())
2257 dump_printf_loc (MSG_NOTE, vect_location,
2258 "Versioning for alignment will be applied.\n");
2260 /* Peeling and versioning can't be done together at this time. */
2261 gcc_assert (! (do_peeling && do_versioning));
2263 return opt_result::success ();
2266 /* This point is reached if neither peeling nor versioning is being done. */
2267 gcc_assert (! (do_peeling || do_versioning));
2269 return opt_result::success ();
2273 /* Function vect_find_same_alignment_drs.
2275 Update group and alignment relations in VINFO according to the chosen
2276 vectorization factor. */
2278 static void
2279 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2281 struct data_reference *dra = DDR_A (ddr);
2282 struct data_reference *drb = DDR_B (ddr);
2283 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2284 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2285 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2286 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2288 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2289 return;
2291 if (dra == drb)
2292 return;
2294 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2295 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2296 return;
2298 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2299 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2300 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2301 return;
2303 /* Two references with distance zero have the same alignment. */
2304 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2305 - wi::to_poly_offset (DR_INIT (drb)));
2306 if (maybe_ne (diff, 0))
2308 /* Get the wider of the two alignments. */
2309 poly_uint64 align_a =
2310 exact_div (vect_calculate_target_alignment (dr_info_a),
2311 BITS_PER_UNIT);
2312 poly_uint64 align_b =
2313 exact_div (vect_calculate_target_alignment (dr_info_b),
2314 BITS_PER_UNIT);
2315 unsigned HOST_WIDE_INT align_a_c, align_b_c;
2316 if (!align_a.is_constant (&align_a_c)
2317 || !align_b.is_constant (&align_b_c))
2318 return;
2320 unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
2322 /* Require the gap to be a multiple of the larger vector alignment. */
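/* E.g., with a 16-byte vector alignment, 4-byte accesses to p[i]
   and p[i + 4] are 16 bytes apart and thus share the same
   misalignment, whereas p[i] and p[i + 1] need not.  */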
2323 if (!multiple_p (diff, max_align))
2324 return;
2327 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2328 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2329 if (dump_enabled_p ())
2330 dump_printf_loc (MSG_NOTE, vect_location,
2331 "accesses have the same alignment: %T and %T\n",
2332 DR_REF (dra), DR_REF (drb));
2336 /* Function vect_analyze_data_refs_alignment
2338 Analyze the alignment of the data-references in the loop.
2339 Return FALSE if a data reference is found that cannot be vectorized. */
2341 opt_result
2342 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2344 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2346 /* Mark groups of data references with same alignment using
2347 data dependence information. */
2348 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
2349 struct data_dependence_relation *ddr;
2350 unsigned int i;
2352 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2353 vect_find_same_alignment_drs (loop_vinfo, ddr);
2355 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2356 struct data_reference *dr;
2358 vect_record_base_alignments (loop_vinfo);
2359 FOR_EACH_VEC_ELT (datarefs, i, dr)
2361 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2362 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2363 vect_compute_data_ref_alignment (loop_vinfo, dr_info);
2366 return opt_result::success ();
2370 /* Analyze alignment of DRs of stmts in NODE. */
2372 static bool
2373 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2375 /* We vectorize from the first scalar stmt in the node unless
2376 the node is permuted in which case we start from the first
2377 element in the group. */
2378 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2379 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2380 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2381 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2383 /* We need to commit to a vector type for the group now. */
2384 if (is_a <bb_vec_info> (vinfo)
2385 && !vect_update_shared_vectype (first_stmt_info, SLP_TREE_VECTYPE (node)))
2386 return false;
2388 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2389 vect_compute_data_ref_alignment (vinfo, dr_info);
2390 /* In several places we need alignment of the first element anyway. */
2391 if (dr_info != first_dr_info)
2392 vect_compute_data_ref_alignment (vinfo, first_dr_info);
2394 /* For creating the data-ref pointer we need alignment of the
2395 first element as well. */
2396 first_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
2397 if (first_stmt_info != SLP_TREE_SCALAR_STMTS (node)[0])
2399 first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2400 if (dr_info != first_dr_info)
2401 vect_compute_data_ref_alignment (vinfo, first_dr_info);
2404 return true;
2407 /* Function vect_slp_analyze_instance_alignment
2409 Analyze the alignment of the data-references in the SLP instance.
2410 Return FALSE if a data reference is found that cannot be vectorized. */
2412 bool
2413 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2414 slp_instance instance)
2416 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2418 slp_tree node;
2419 unsigned i;
2420 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2421 if (! vect_slp_analyze_node_alignment (vinfo, node))
2422 return false;
2424 node = SLP_INSTANCE_TREE (instance);
2425 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
2426 && ! vect_slp_analyze_node_alignment
2427 (vinfo, SLP_INSTANCE_TREE (instance)))
2428 return false;
2430 return true;
2434 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2435 accesses of legal size, step, etc. Detect gaps, single element
2436 interleaving, and other special cases. Set grouped access info.
2437 Collect groups of strided stores for further use in SLP analysis.
2438 Worker for vect_analyze_group_access. */
2440 static bool
2441 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2443 data_reference *dr = dr_info->dr;
2444 tree step = DR_STEP (dr);
2445 tree scalar_type = TREE_TYPE (DR_REF (dr));
2446 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2447 stmt_vec_info stmt_info = dr_info->stmt;
2448 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2449 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2450 HOST_WIDE_INT dr_step = -1;
2451 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2452 bool slp_impossible = false;
2454 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2455 size of the interleaving group (including gaps). */
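/* E.g., a byte step of 32 with 4-byte elements gives a GROUPSIZE
   of 8.  */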
2456 if (tree_fits_shwi_p (step))
2458 dr_step = tree_to_shwi (step);
2459 /* Check that STEP is a multiple of type size. Otherwise there is
2460 a non-element-sized gap at the end of the group which we
2461 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2462 ??? As we can handle non-constant step fine here we should
2463 simply remove uses of DR_GROUP_GAP between the last and first
2464 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2465 simply not include that gap. */
2466 if ((dr_step % type_size) != 0)
2468 if (dump_enabled_p ())
2469 dump_printf_loc (MSG_NOTE, vect_location,
2470 "Step %T is not a multiple of the element size"
2471 " for %T\n",
2472 step, DR_REF (dr));
2473 return false;
2475 groupsize = absu_hwi (dr_step) / type_size;
2477 else
2478 groupsize = 0;
2480 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2481 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2483 /* Check if this DR is part of an interleaving group, and is a single
2484 element of the group that is accessed in the loop. */
2486 /* Gaps are supported only for loads. STEP must be a multiple of the type
2487 size. */
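/* For example, a load of a[4*i] with 4-byte elements has a byte
   step of 16, giving a single-element group of size 4 with a gap
   of 3.  */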
2488 if (DR_IS_READ (dr)
2489 && (dr_step % type_size) == 0
2490 && groupsize > 0)
2492 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2493 DR_GROUP_SIZE (stmt_info) = groupsize;
2494 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2495 if (dump_enabled_p ())
2496 dump_printf_loc (MSG_NOTE, vect_location,
2497 "Detected single element interleaving %T"
2498 " step %T\n",
2499 DR_REF (dr), step);
2501 return true;
2504 if (dump_enabled_p ())
2505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2506 "not consecutive access %G", stmt_info->stmt);
2508 if (bb_vinfo)
2510 /* Mark the statement as unvectorizable. */
2511 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2512 return true;
2515 if (dump_enabled_p ())
2516 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2517 STMT_VINFO_STRIDED_P (stmt_info) = true;
2518 return true;
2521 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2523 /* First stmt in the interleaving chain. Check the chain. */
2524 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2525 struct data_reference *data_ref = dr;
2526 unsigned int count = 1;
2527 tree prev_init = DR_INIT (data_ref);
2528 HOST_WIDE_INT diff, gaps = 0;
2530 /* By construction, all group members have INTEGER_CST DR_INITs. */
2531 while (next)
2533 /* We never have the same DR multiple times. */
2534 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2535 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2537 data_ref = STMT_VINFO_DATA_REF (next);
2539 /* All group members have the same STEP by construction. */
2540 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2542 /* Check that the distance between two accesses is equal to the type
2543 size. Otherwise, we have gaps. */
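/* E.g., 4-byte members with DR_INITs 0 and 8 give DIFF == 2,
   i.e. a gap of one element between them.  */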
2544 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2545 - TREE_INT_CST_LOW (prev_init)) / type_size;
2546 if (diff != 1)
2548 /* FORNOW: SLP of accesses with gaps is not supported. */
2549 slp_impossible = true;
2550 if (DR_IS_WRITE (data_ref))
2552 if (dump_enabled_p ())
2553 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2554 "interleaved store with gaps\n");
2555 return false;
2558 gaps += diff - 1;
2561 last_accessed_element += diff;
2563 /* Store the gap from the previous member of the group. If there is no
2564 gap in the access, DR_GROUP_GAP is always 1. */
2565 DR_GROUP_GAP (next) = diff;
2567 prev_init = DR_INIT (data_ref);
2568 next = DR_GROUP_NEXT_ELEMENT (next);
2569 /* Count the number of data-refs in the chain. */
2570 count++;
2573 if (groupsize == 0)
2574 groupsize = count + gaps;
2576 /* This could be UINT_MAX but as we are generating code in a very
2577 inefficient way we have to cap earlier. See PR78699 for example. */
2578 if (groupsize > 4096)
2580 if (dump_enabled_p ())
2581 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2582 "group is too large\n");
2583 return false;
2586 /* Check that the size of the interleaving is equal to count for stores,
2587 i.e., that there are no gaps. */
2588 if (groupsize != count
2589 && !DR_IS_READ (dr))
2591 groupsize = count;
2592 STMT_VINFO_STRIDED_P (stmt_info) = true;
2595 /* If there is a gap after the last load in the group it is the
2596 difference between the groupsize and the last accessed
2597 element.
2598 When there is no gap, this difference should be 0. */
2599 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2601 DR_GROUP_SIZE (stmt_info) = groupsize;
2602 if (dump_enabled_p ())
2604 dump_printf_loc (MSG_NOTE, vect_location,
2605 "Detected interleaving ");
2606 if (DR_IS_READ (dr))
2607 dump_printf (MSG_NOTE, "load ");
2608 else if (STMT_VINFO_STRIDED_P (stmt_info))
2609 dump_printf (MSG_NOTE, "strided store ");
2610 else
2611 dump_printf (MSG_NOTE, "store ");
2612 dump_printf (MSG_NOTE, "of size %u\n",
2613 (unsigned)groupsize);
2614 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2615 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2616 while (next)
2618 if (DR_GROUP_GAP (next) != 1)
2619 dump_printf_loc (MSG_NOTE, vect_location,
2620 "\t<gap of %d elements>\n",
2621 DR_GROUP_GAP (next) - 1);
2622 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2623 next = DR_GROUP_NEXT_ELEMENT (next);
2625 if (DR_GROUP_GAP (stmt_info) != 0)
2626 dump_printf_loc (MSG_NOTE, vect_location,
2627 "\t<gap of %d elements>\n",
2628 DR_GROUP_GAP (stmt_info));
2631 /* SLP: create an SLP data structure for every interleaving group of
2632 stores for further analysis in vect_analyze_slp. */
2633 if (DR_IS_WRITE (dr) && !slp_impossible)
2635 if (loop_vinfo)
2636 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2637 if (bb_vinfo)
2638 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2642 return true;
2645 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2646 accesses of legal size, step, etc. Detect gaps, single element
2647 interleaving, and other special cases. Set grouped access info.
2648 Collect groups of strided stores for further use in SLP analysis. */
2650 static bool
2651 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2653 if (!vect_analyze_group_access_1 (vinfo, dr_info))
2655 /* Dissolve the group if present. */
2656 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2657 while (stmt_info)
2659 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2660 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2661 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2662 stmt_info = next;
2664 return false;
2666 return true;
2669 /* Analyze the access pattern of the data-reference DR_INFO.
2670 In case of non-consecutive accesses call vect_analyze_group_access() to
2671 analyze groups of accesses. */
2673 static bool
2674 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2676 data_reference *dr = dr_info->dr;
2677 tree step = DR_STEP (dr);
2678 tree scalar_type = TREE_TYPE (DR_REF (dr));
2679 stmt_vec_info stmt_info = dr_info->stmt;
2680 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2681 class loop *loop = NULL;
2683 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2684 return true;
2686 if (loop_vinfo)
2687 loop = LOOP_VINFO_LOOP (loop_vinfo);
2689 if (loop_vinfo && !step)
2691 if (dump_enabled_p ())
2692 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2693 "bad data-ref access in loop\n");
2694 return false;
2697 /* Allow loads with zero step in inner-loop vectorization. */
2698 if (loop_vinfo && integer_zerop (step))
2700 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2701 if (!nested_in_vect_loop_p (loop, stmt_info))
2702 return DR_IS_READ (dr);
2703 /* Allow references with zero step for outer loops marked
2704 with pragma omp simd only - it guarantees absence of
2705 loop-carried dependencies between inner loop iterations. */
2706 if (loop->safelen < 2)
2708 if (dump_enabled_p ())
2709 dump_printf_loc (MSG_NOTE, vect_location,
2710 "zero step in inner loop of nest\n");
2711 return false;
2715 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2717 /* Interleaved accesses are not yet supported within outer-loop
2718 vectorization for references in the inner-loop. */
2719 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2721 /* For the rest of the analysis we use the outer-loop step. */
2722 step = STMT_VINFO_DR_STEP (stmt_info);
2723 if (integer_zerop (step))
2725 if (dump_enabled_p ())
2726 dump_printf_loc (MSG_NOTE, vect_location,
2727 "zero step in outer loop.\n");
2728 return DR_IS_READ (dr);
2732 /* Consecutive? */
2733 if (TREE_CODE (step) == INTEGER_CST)
2735 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2736 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2737 || (dr_step < 0
2738 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2740 /* Mark that it is not interleaving. */
2741 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2742 return true;
2746 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2748 if (dump_enabled_p ())
2749 dump_printf_loc (MSG_NOTE, vect_location,
2750 "grouped access in outer loop.\n");
2751 return false;
2755 /* Assume this is a DR handled by non-constant strided load case. */
2756 if (TREE_CODE (step) != INTEGER_CST)
2757 return (STMT_VINFO_STRIDED_P (stmt_info)
2758 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2759 || vect_analyze_group_access (vinfo, dr_info)));
2761 /* Not a consecutive access - check if it's part of an interleaving group. */
2762 return vect_analyze_group_access (vinfo, dr_info);
2765 typedef std::pair<data_reference_p, int> data_ref_pair;
2767 /* Compare two data-references DRA and DRB so that they can be sorted
2768 into chunks suitable for grouping. */
2770 static int
2771 dr_group_sort_cmp (const void *dra_, const void *drb_)
2773 data_ref_pair dra_pair = *(data_ref_pair *)const_cast<void *>(dra_);
2774 data_ref_pair drb_pair = *(data_ref_pair *)const_cast<void *>(drb_);
2775 data_reference_p dra = dra_pair.first;
2776 data_reference_p drb = drb_pair.first;
2777 int cmp;
2779 /* Stabilize sort. */
2780 if (dra == drb)
2781 return 0;
2783 /* DRs in different basic-blocks never belong to the same group. */
2784 int bb_index1 = gimple_bb (DR_STMT (dra))->index;
2785 int bb_index2 = gimple_bb (DR_STMT (drb))->index;
2786 if (bb_index1 != bb_index2)
2787 return bb_index1 < bb_index2 ? -1 : 1;
2789 /* DRs with different group IDs never belong to the same group. */
2790 if (dra_pair.second != drb_pair.second)
2791 return dra_pair.second < drb_pair.second ? -1 : 1;
2793 /* Ordering of DRs according to base. */
2794 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2795 DR_BASE_ADDRESS (drb));
2796 if (cmp != 0)
2797 return cmp;
2799 /* And according to DR_OFFSET. */
2800 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2801 if (cmp != 0)
2802 return cmp;
2804 /* Put reads before writes. */
2805 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2806 return DR_IS_READ (dra) ? -1 : 1;
2808 /* Then sort after access size. */
2809 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2810 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2811 if (cmp != 0)
2812 return cmp;
2814 /* And after step. */
2815 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2816 if (cmp != 0)
2817 return cmp;
2819 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2820 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2821 if (cmp == 0)
2822 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2823 return cmp;
2826 /* If OP is the result of a conversion, return the unconverted value,
2827 otherwise return null. */
2829 static tree
2830 strip_conversion (tree op)
2832 if (TREE_CODE (op) != SSA_NAME)
2833 return NULL_TREE;
2834 gimple *stmt = SSA_NAME_DEF_STMT (op);
2835 if (!is_gimple_assign (stmt)
2836 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2837 return NULL_TREE;
2838 return gimple_assign_rhs1 (stmt);
2841 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2842 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
2843 be grouped in SLP mode. */
2845 static bool
2846 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2847 bool allow_slp_p)
2849 if (gimple_assign_single_p (stmt1_info->stmt))
2850 return gimple_assign_single_p (stmt2_info->stmt);
2852 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2853 if (call1 && gimple_call_internal_p (call1))
2855 /* Check for two masked loads or two masked stores. */
2856 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2857 if (!call2 || !gimple_call_internal_p (call2))
2858 return false;
2859 internal_fn ifn = gimple_call_internal_fn (call1);
2860 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2861 return false;
2862 if (ifn != gimple_call_internal_fn (call2))
2863 return false;
2865 /* Check that the masks are the same. Cope with casts of masks,
2866 like those created by build_mask_conversion. */
2867 tree mask1 = gimple_call_arg (call1, 2);
2868 tree mask2 = gimple_call_arg (call2, 2);
2869 if (!operand_equal_p (mask1, mask2, 0)
2870 && (ifn == IFN_MASK_STORE || !allow_slp_p))
2872 mask1 = strip_conversion (mask1);
2873 if (!mask1)
2874 return false;
2875 mask2 = strip_conversion (mask2);
2876 if (!mask2)
2877 return false;
2878 if (!operand_equal_p (mask1, mask2, 0))
2879 return false;
2881 return true;
2884 return false;
2887 /* Function vect_analyze_data_ref_accesses.
2889 Analyze the access pattern of all the data references in the loop.
2891 FORNOW: the only access pattern that is considered vectorizable is a
2892 simple step 1 (consecutive) access.
2894 FORNOW: handle only arrays and pointer accesses. */
2896 opt_result
2897 vect_analyze_data_ref_accesses (vec_info *vinfo,
2898 vec<int> *dataref_groups)
2900 unsigned int i;
2901 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2903 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2905 if (datarefs.is_empty ())
2906 return opt_result::success ();
2908 /* Sort the array of datarefs to make building the interleaving chains
2909 linear. Don't modify the original vector's order; it is needed for
2910 determining what dependencies are reversed. */
2911 vec<data_ref_pair> datarefs_copy;
2912 datarefs_copy.create (datarefs.length ());
2913 for (unsigned i = 0; i < datarefs.length (); i++)
2915 int group_id = dataref_groups ? (*dataref_groups)[i] : 0;
2916 datarefs_copy.quick_push (data_ref_pair (datarefs[i], group_id));
2918 datarefs_copy.qsort (dr_group_sort_cmp);
2919 hash_set<stmt_vec_info> to_fixup;
2921 /* Build the interleaving chains. */
2922 for (i = 0; i < datarefs_copy.length () - 1;)
2924 data_reference_p dra = datarefs_copy[i].first;
2925 int dra_group_id = datarefs_copy[i].second;
2926 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2927 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2928 stmt_vec_info lastinfo = NULL;
2929 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2930 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2932 ++i;
2933 continue;
2935 for (i = i + 1; i < datarefs_copy.length (); ++i)
2937 data_reference_p drb = datarefs_copy[i].first;
2938 int drb_group_id = datarefs_copy[i].second;
2939 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2940 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2941 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2942 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2943 break;
2945 /* ??? Imperfect sorting (non-compatible types, non-modulo
2946 accesses, same accesses) can lead to a group being artificially
2947 split here as we don't just skip over those. If it really
2948 matters we can push those to a worklist and re-iterate
2949 over them. Then we can just skip ahead to the next DR here. */
2951 /* DRs in different BBs should not be put into the same
2952 interleaving group. */
2953 int bb_index1 = gimple_bb (DR_STMT (dra))->index;
2954 int bb_index2 = gimple_bb (DR_STMT (drb))->index;
2955 if (bb_index1 != bb_index2)
2956 break;
2958 if (dra_group_id != drb_group_id)
2959 break;
2961 /* Check that the data-refs have the same first location (except init)
2962 and they are both either store or load (not load and store,
2963 not masked loads or stores). */
2964 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2965 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2966 DR_BASE_ADDRESS (drb)) != 0
2967 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2968 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
2969 break;
2971 /* Check that the data-refs have the same constant size. */
2972 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2973 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2974 if (!tree_fits_uhwi_p (sza)
2975 || !tree_fits_uhwi_p (szb)
2976 || !tree_int_cst_equal (sza, szb))
2977 break;
2979 /* Check that the data-refs have the same step. */
2980 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2981 break;
2983 /* Check the types are compatible.
2984 ??? We don't distinguish this during sorting. */
2985 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2986 TREE_TYPE (DR_REF (drb))))
2987 break;
2989 /* Check that the DR_INITs are compile-time constants. */
2990 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2991 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2992 break;
2994 /* Different .GOMP_SIMD_LANE calls still give the same lane,
2995 just hold extra information. */
2996 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
2997 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
2998 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
2999 break;
3001 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3002 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3003 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3004 HOST_WIDE_INT init_prev
3005 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1].first));
3006 gcc_assert (init_a <= init_b
3007 && init_a <= init_prev
3008 && init_prev <= init_b);
3010 /* Do not place the same access in the interleaving chain twice. */
3011 if (init_b == init_prev)
3013 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1].first))
3014 < gimple_uid (DR_STMT (drb)));
3015 /* Simply link in duplicates and fix up the chain below. */
3017 else
3019 /* If init_b == init_a + the size of the type * k, we have an
3020 interleaving, and DRA is accessed before DRB. */
3021 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3022 if (type_size_a == 0
3023 || (init_b - init_a) % type_size_a != 0)
3024 break;
3026 /* If we have a store, require the accesses to be adjacent. This splits
3027 groups into chunks we support (we don't support vectorization
3028 of stores with gaps). */
3029 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3030 break;
3032 /* If the step (when nonzero and constant) is smaller than the
3033 difference between the data-refs' inits, this splits groups into
3034 suitable sizes. */
3035 if (tree_fits_shwi_p (DR_STEP (dra)))
3037 unsigned HOST_WIDE_INT step
3038 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3039 if (step != 0
3040 && step <= (unsigned HOST_WIDE_INT)(init_b - init_a))
3041 break;
3045 if (dump_enabled_p ())
3046 dump_printf_loc (MSG_NOTE, vect_location,
3047 DR_IS_READ (dra)
3048 ? "Detected interleaving load %T and %T\n"
3049 : "Detected interleaving store %T and %T\n",
3050 DR_REF (dra), DR_REF (drb));
3052 /* Link the found element into the group list. */
3053 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3055 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3056 lastinfo = stmtinfo_a;
3058 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3059 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3060 lastinfo = stmtinfo_b;
3062 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3063 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3065 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3066 dump_printf_loc (MSG_NOTE, vect_location,
3067 "Load suitable for SLP vectorization only.\n");
3069 if (init_b == init_prev
3070 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3071 && dump_enabled_p ())
3072 dump_printf_loc (MSG_NOTE, vect_location,
3073 "Queuing group with duplicate access for fixup\n");
3077 /* Fixup groups with duplicate entries by splitting them. */
3078 while (1)
3080 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3081 if (!(it != to_fixup.end ()))
3082 break;
3083 stmt_vec_info grp = *it;
3084 to_fixup.remove (grp);
3086 /* Find the earliest duplicate group member. */
3087 unsigned first_duplicate = -1u;
3088 stmt_vec_info next, g = grp;
3089 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3091 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3092 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3093 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3094 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3095 g = next;
3097 if (first_duplicate == -1U)
3098 continue;
3100 /* Then move all stmts after the first duplicate to a new group.
3101 Note this is a heuristic but one with the property that *it
3102 is fixed up completely. */
3103 g = grp;
3104 stmt_vec_info newgroup = NULL, ng = grp;
3105 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3107 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3109 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3110 if (!newgroup)
3111 newgroup = next;
3112 else
3113 DR_GROUP_NEXT_ELEMENT (ng) = next;
3114 ng = next;
3115 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3117 else
3118 g = DR_GROUP_NEXT_ELEMENT (g);
3120 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3122 /* Fixup the new group which still may contain duplicates. */
3123 to_fixup.add (newgroup);
3126 data_ref_pair *dr_pair;
3127 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_pair)
3129 dr_vec_info *dr_info = vinfo->lookup_dr (dr_pair->first);
3130 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3131 && !vect_analyze_data_ref_access (vinfo, dr_info))
3133 if (dump_enabled_p ())
3134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3135 "not vectorized: complicated access pattern.\n");
3137 if (is_a <bb_vec_info> (vinfo))
3139 /* Mark the statement as not vectorizable. */
3140 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3141 continue;
3143 else
3145 datarefs_copy.release ();
3146 return opt_result::failure_at (dr_info->stmt->stmt,
3147 "not vectorized:"
3148 " complicated access pattern.\n");
3153 datarefs_copy.release ();
3154 return opt_result::success ();
3157 /* Function vect_vfa_segment_size.
3159 Input:
3160 DR_INFO: The data reference.
3161 LENGTH_FACTOR: segment length to consider.
3163 Return a value suitable for the dr_with_seg_len::seg_len field.
3164 This is the "distance travelled" by the pointer from the first
3165 iteration in the segment to the last. Note that it does not include
3166 the size of the access; in effect it only describes the first byte. */
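/* For example, a 4-byte DR_STEP and a LENGTH_FACTOR of 8 give a
   segment length of (8 - 1) * 4 == 28 bytes.  */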
3168 static tree
3169 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3171 length_factor = size_binop (MINUS_EXPR,
3172 fold_convert (sizetype, length_factor),
3173 size_one_node);
3174 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3175 length_factor);
3178 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3179 gives the worst-case number of bytes covered by the segment. */
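/* For example, a 4-byte reference that starts a group of size 4
   with a gap of 1 covers 4 * (4 - 1) == 12 bytes.  */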
3181 static unsigned HOST_WIDE_INT
3182 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3184 stmt_vec_info stmt_vinfo = dr_info->stmt;
3185 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3186 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3187 unsigned HOST_WIDE_INT access_size = ref_size;
3188 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3190 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3191 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3193 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3194 && (vect_supportable_dr_alignment (vinfo, dr_info, false)
3195 == dr_explicit_realign_optimized))
3197 /* We might access a full vector's worth. */
3198 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3199 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3201 return access_size;
3204 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3205 describes. */
3207 static unsigned int
3208 vect_vfa_align (dr_vec_info *dr_info)
3210 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3213 /* Function vect_compile_time_alias.
3215 Given data references A and B with equal base and offset, see whether
3216 the alias relation can be decided at compilation time. Return 1 if
3217 it can and the references alias, 0 if it can and the references do
3218 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3219 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3220 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3222 static int
3223 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3224 tree segment_length_a, tree segment_length_b,
3225 unsigned HOST_WIDE_INT access_size_a,
3226 unsigned HOST_WIDE_INT access_size_b)
3228 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3229 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3230 poly_uint64 const_length_a;
3231 poly_uint64 const_length_b;
3233 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3234 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3235 [a, a+12) */
3236 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3238 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3239 offset_a -= const_length_a;
3241 else
3242 const_length_a = tree_to_poly_uint64 (segment_length_a);
3243 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3245 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3246 offset_b -= const_length_b;
3248 else
3249 const_length_b = tree_to_poly_uint64 (segment_length_b);
3251 const_length_a += access_size_a;
3252 const_length_b += access_size_b;
3254 if (ranges_known_overlap_p (offset_a, const_length_a,
3255 offset_b, const_length_b))
3256 return 1;
3258 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3259 offset_b, const_length_b))
3260 return 0;
3262 return -1;
3265 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3266 in DDR is >= VF. */
3268 static bool
3269 dependence_distance_ge_vf (data_dependence_relation *ddr,
3270 unsigned int loop_depth, poly_uint64 vf)
3272 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3273 || DDR_NUM_DIST_VECTS (ddr) == 0)
3274 return false;
3276 /* If the dependence is exact, we should have limited the VF instead. */
3277 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3279 unsigned int i;
3280 lambda_vector dist_v;
3281 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3283 HOST_WIDE_INT dist = dist_v[loop_depth];
3284 if (dist != 0
3285 && !(dist > 0 && DDR_REVERSED_P (ddr))
3286 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3287 return false;
3290 if (dump_enabled_p ())
3291 dump_printf_loc (MSG_NOTE, vect_location,
3292 "dependence distance between %T and %T is >= VF\n",
3293 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3295 return true;
3298 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3300 static void
3301 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3303 dump_printf (dump_kind, "%s (%T) >= ",
3304 lower_bound.unsigned_p ? "unsigned" : "abs",
3305 lower_bound.expr);
3306 dump_dec (dump_kind, lower_bound.min_value);
3309 /* Record that the vectorized loop requires the vec_lower_bound described
3310 by EXPR, UNSIGNED_P and MIN_VALUE. */
3312 static void
3313 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3314 poly_uint64 min_value)
3316 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3317 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3318 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3320 unsigned_p &= lower_bounds[i].unsigned_p;
3321 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3322 if (lower_bounds[i].unsigned_p != unsigned_p
3323 || maybe_lt (lower_bounds[i].min_value, min_value))
3325 lower_bounds[i].unsigned_p = unsigned_p;
3326 lower_bounds[i].min_value = min_value;
3327 if (dump_enabled_p ())
3329 dump_printf_loc (MSG_NOTE, vect_location,
3330 "updating run-time check to ");
3331 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3332 dump_printf (MSG_NOTE, "\n");
3335 return;
3338 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3339 if (dump_enabled_p ())
3341 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3342 dump_lower_bound (MSG_NOTE, lower_bound);
3343 dump_printf (MSG_NOTE, "\n");
3345 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3348 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3349 will span fewer than GAP bytes. */
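/* For example, with an estimated VF of 4 and an ungrouped 8-byte
   access, a gap of up to 4 * 8 == 32 bytes counts as small.  */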
3351 static bool
3352 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3353 poly_int64 gap)
3355 stmt_vec_info stmt_info = dr_info->stmt;
3356 HOST_WIDE_INT count
3357 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3358 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3359 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3360 return (estimated_poly_value (gap)
3361 <= count * vect_get_scalar_dr_size (dr_info));
3364 /* Return true if we know that there is no alias between DR_INFO_A and
3365 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3366 When returning true, set *LOWER_BOUND_OUT to this N. */
3368 static bool
3369 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3370 poly_uint64 *lower_bound_out)
3372 /* Check that there is a constant gap of known sign between DR_A
3373 and DR_B. */
3374 data_reference *dr_a = dr_info_a->dr;
3375 data_reference *dr_b = dr_info_b->dr;
3376 poly_int64 init_a, init_b;
3377 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3378 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3379 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3380 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3381 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3382 || !ordered_p (init_a, init_b))
3383 return false;
3385 /* Sort DR_A and DR_B by the address they access. */
3386 if (maybe_lt (init_b, init_a))
3388 std::swap (init_a, init_b);
3389 std::swap (dr_info_a, dr_info_b);
3390 std::swap (dr_a, dr_b);
3393 /* If the two accesses could be dependent within a scalar iteration,
3394 make sure that we'd retain their order. */
3395 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3396 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3397 return false;
3399 /* There is no alias if abs (DR_STEP) is greater than or equal to
3400 the bytes spanned by the combination of the two accesses. */
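/* E.g., two 4-byte accesses at byte offsets 0 and 4 give a lower
   bound of 4 + 4 - 0 == 8, so a step of at least 8 bytes rules out
   an alias between them.  */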
3401 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3402 return true;
3405 /* Function vect_prune_runtime_alias_test_list.
3407 Prune a list of ddrs to be tested at run-time by versioning for alias.
3408 Merge several alias checks into one if possible.
3409 Return FALSE if the resulting list of ddrs is longer than allowed by
3410 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3412 opt_result
3413 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3415 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3416 hash_set <tree_pair_hash> compared_objects;
3418 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3419 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3420 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3421 vec<vec_object_pair> &check_unequal_addrs
3422 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3423 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3424 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3426 ddr_p ddr;
3427 unsigned int i;
3428 tree length_factor;
3430 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3432 /* Step values are irrelevant for aliasing if the number of vector
3433 iterations is equal to the number of scalar iterations (which can
3434 happen for fully-SLP loops). */
3435 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3437 if (!ignore_step_p)
3439 /* Convert the checks for nonzero steps into bound tests. */
3440 tree value;
3441 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3442 vect_check_lower_bound (loop_vinfo, value, true, 1);
3445 if (may_alias_ddrs.is_empty ())
3446 return opt_result::success ();
3448 comp_alias_ddrs.create (may_alias_ddrs.length ());
3450 unsigned int loop_depth
3451 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3452 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3454 /* First, we collect all data ref pairs for aliasing checks. */
3455 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3457 poly_uint64 lower_bound;
3458 tree segment_length_a, segment_length_b;
3459 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3460 unsigned int align_a, align_b;
3462 /* Ignore the alias if the VF we chose ended up being no greater
3463 than the dependence distance. */
3464 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3465 continue;
3467 if (DDR_OBJECT_A (ddr))
3469 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3470 if (!compared_objects.add (new_pair))
3472 if (dump_enabled_p ())
3473 dump_printf_loc (MSG_NOTE, vect_location,
3474 "checking that %T and %T"
3475 " have different addresses\n",
3476 new_pair.first, new_pair.second);
3477 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3479 continue;
3482 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3483 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3485 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3486 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3488 bool preserves_scalar_order_p
3489 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3491 /* Skip the pair if inter-iteration dependencies are irrelevant
3492 and intra-iteration dependencies are guaranteed to be honored. */
3493 if (ignore_step_p
3494 && (preserves_scalar_order_p
3495 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3496 &lower_bound)))
3498 if (dump_enabled_p ())
3499 dump_printf_loc (MSG_NOTE, vect_location,
3500 "no need for alias check between "
3501 "%T and %T when VF is 1\n",
3502 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3503 continue;
3506 /* See whether we can handle the alias using a bounds check on
3507 the step, and whether that's likely to be the best approach.
3508 (It might not be, for example, if the minimum step is much larger
3509 than the number of bytes handled by one vector iteration.) */
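  /* The vect_small_gap_p tests below implement that heuristic: take this
     path only if, for at least one of the two references, the required
     lower bound fits within the bytes spanned by one vector iteration.  */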
3510 if (!ignore_step_p
3511 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3512 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3513 &lower_bound)
3514 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3515 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3517 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3518 if (dump_enabled_p ())
3520 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3521 "%T and %T when the step %T is outside ",
3522 DR_REF (dr_info_a->dr),
3523 DR_REF (dr_info_b->dr),
3524 DR_STEP (dr_info_a->dr));
3525 if (unsigned_p)
3526 dump_printf (MSG_NOTE, "[0");
3527 else
3529 dump_printf (MSG_NOTE, "(");
3530 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3532 dump_printf (MSG_NOTE, ", ");
3533 dump_dec (MSG_NOTE, lower_bound);
3534 dump_printf (MSG_NOTE, ")\n");
3536 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3537 unsigned_p, lower_bound);
3538 continue;
3541 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3542 if (dr_group_first_a)
3544 stmt_info_a = dr_group_first_a;
3545 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3548 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3549 if (dr_group_first_b)
3551 stmt_info_b = dr_group_first_b;
3552 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3555 if (ignore_step_p)
3557 segment_length_a = size_zero_node;
3558 segment_length_b = size_zero_node;
3560 else
3562 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3563 DR_STEP (dr_info_b->dr), 0))
3564 length_factor = scalar_loop_iters;
3565 else
3566 length_factor = size_int (vect_factor);
3567 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3568 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3570 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3571 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3572 align_a = vect_vfa_align (dr_info_a);
3573 align_b = vect_vfa_align (dr_info_b);
3575 /* See whether the alias is known at compilation time. */
3576 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3577 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3578 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3579 DR_OFFSET (dr_info_b->dr), 0)
3580 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3581 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3582 && poly_int_tree_p (segment_length_a)
3583 && poly_int_tree_p (segment_length_b))
3585 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3586 segment_length_a,
3587 segment_length_b,
3588 access_size_a,
3589 access_size_b);
3590 if (res >= 0 && dump_enabled_p ())
3592 dump_printf_loc (MSG_NOTE, vect_location,
3593 "can tell at compile time that %T and %T",
3594 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3595 if (res == 0)
3596 dump_printf (MSG_NOTE, " do not alias\n");
3597 else
3598 dump_printf (MSG_NOTE, " alias\n");
3601 if (res == 0)
3602 continue;
3604 if (res == 1)
3605 return opt_result::failure_at (stmt_info_b->stmt,
3606 "not vectorized:"
3607 " compilation time alias: %G%G",
3608 stmt_info_a->stmt,
3609 stmt_info_b->stmt);
3612 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3613 access_size_a, align_a);
3614 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3615 access_size_b, align_b);
3616 /* Canonicalize the order to be the one that's needed for accurate
3617 RAW, WAR and WAW flags, in cases where the data references are
3618 well-ordered. The order doesn't really matter otherwise,
3619 but we might as well be consistent. */
3620 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3621 std::swap (dr_a, dr_b);
3623 dr_with_seg_len_pair_t dr_with_seg_len_pair
3624 (dr_a, dr_b, (preserves_scalar_order_p
3625 ? dr_with_seg_len_pair_t::WELL_ORDERED
3626 : dr_with_seg_len_pair_t::REORDERED));
3628 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3631 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3633 unsigned int count = (comp_alias_ddrs.length ()
3634 + check_unequal_addrs.length ());
3636 if (dump_enabled_p ())
3637 dump_printf_loc (MSG_NOTE, vect_location,
3638 "improved number of alias checks from %d to %d\n",
3639 may_alias_ddrs.length (), count);
3640 unsigned limit = param_vect_max_version_for_alias_checks;
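  /* With the cheap cost model, tolerate only 60% of the usual number of
     runtime alias checks before giving up on versioning.  */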
3641 if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
3642 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3643 if (count > limit)
3644 return opt_result::failure_at
3645 (vect_location,
3646 "number of versioning for alias run-time tests exceeds %d "
3647 "(--param vect-max-version-for-alias-checks)\n", limit);
3649 return opt_result::success ();
3652 /* Check whether we can use an internal function for a gather load
3653 or scatter store. READ_P is true for loads and false for stores.
3654 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3655 the type of the memory elements being loaded or stored. OFFSET_TYPE
3656 is the type of the offset that is being applied to the invariant
3657 base address. SCALE is the amount by which the offset should
3658 be multiplied *after* it has been converted to address width.
3660 Return true if the function is supported, storing the function id in
3661 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
3663 bool
3664 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3665 tree vectype, tree memory_type, tree offset_type,
3666 int scale, internal_fn *ifn_out,
3667 tree *offset_vectype_out)
3669 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3670 unsigned int element_bits = vector_element_bits (vectype);
3671 if (element_bits != memory_bits)
3672 /* For now the vector elements must be the same width as the
3673 memory elements. */
3674 return false;
3676 /* Work out which function we need. */
3677 internal_fn ifn;
3678 if (read_p)
3679 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3680 else
3681 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
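  /* Try OFFSET_TYPE as given; if the target does not support the
     combination, retry with offset types of successively doubled precision
     until either a supported combination is found or the offset type is
     already at least pointer-sized and at least as wide as the vector
     elements.  */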
3683 for (;;)
3685 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3686 if (!offset_vectype)
3687 return false;
3689 /* Test whether the target supports this combination. */
3690 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3691 offset_vectype, scale))
3693 *ifn_out = ifn;
3694 *offset_vectype_out = offset_vectype;
3695 return true;
3698 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3699 && TYPE_PRECISION (offset_type) >= element_bits)
3700 return false;
3702 offset_type = build_nonstandard_integer_type
3703 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3707 /* STMT_INFO is a call to an internal gather load or scatter store function.
3708 Describe the operation in INFO. */
3710 static void
3711 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3712 gather_scatter_info *info)
3714 gcall *call = as_a <gcall *> (stmt_info->stmt);
3715 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3716 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3718 info->ifn = gimple_call_internal_fn (call);
3719 info->decl = NULL_TREE;
3720 info->base = gimple_call_arg (call, 0);
3721 info->offset = gimple_call_arg (call, 1);
3722 info->offset_dt = vect_unknown_def_type;
3723 info->offset_vectype = NULL_TREE;
3724 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3725 info->element_type = TREE_TYPE (vectype);
3726 info->memory_type = TREE_TYPE (DR_REF (dr));
3729 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3730 gather load or scatter store. Describe the operation in *INFO if so. */
3732 bool
3733 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3734 gather_scatter_info *info)
3736 HOST_WIDE_INT scale = 1;
3737 poly_int64 pbitpos, pbitsize;
3738 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3739 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3740 tree offtype = NULL_TREE;
3741 tree decl = NULL_TREE, base, off;
3742 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3743 tree memory_type = TREE_TYPE (DR_REF (dr));
3744 machine_mode pmode;
3745 int punsignedp, reversep, pvolatilep = 0;
3746 internal_fn ifn;
3747 tree offset_vectype;
3748 bool masked_p = false;
3750 /* See whether this is already a call to a gather/scatter internal function.
3751 If not, see whether it's a masked load or store. */
3752 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3753 if (call && gimple_call_internal_p (call))
3755 ifn = gimple_call_internal_fn (call);
3756 if (internal_gather_scatter_fn_p (ifn))
3758 vect_describe_gather_scatter_call (stmt_info, info);
3759 return true;
3761 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3764 /* True if we should aim to use internal functions rather than
3765 built-in functions. */
3766 bool use_ifn_p = (DR_IS_READ (dr)
3767 ? supports_vec_gather_load_p ()
3768 : supports_vec_scatter_store_p ());
3770 base = DR_REF (dr);
3771 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3772 see if we can use the def stmt of the address. */
3773 if (masked_p
3774 && TREE_CODE (base) == MEM_REF
3775 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3776 && integer_zerop (TREE_OPERAND (base, 1))
3777 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3779 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3780 if (is_gimple_assign (def_stmt)
3781 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3782 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3785 /* The gather and scatter builtins need an address of the form
3786 loop_invariant + vector * {1, 2, 4, 8}
3787 or
3788 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3789 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3790 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3791 multiplications and additions in it. To get a vector, we need
3792 a single SSA_NAME that will be defined in the loop and will
3793 contain everything that is not loop invariant and that can be
3794 vectorized. The following code attempts to find such a preexisting
3795 SSA_NAME OFF and put the loop invariants into a tree BASE
3796 that can be gimplified before the loop. */
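  /* For example (illustrative only): for an access like p[idx[i]] with
     loop-invariant P, the intent is that OFF ends up as the SSA_NAME holding
     the index, the element size is absorbed into SCALE, and P together with
     any constant byte offset is folded into BASE.  */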
3797 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3798 &punsignedp, &reversep, &pvolatilep);
3799 if (reversep)
3800 return false;
3802 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3804 if (TREE_CODE (base) == MEM_REF)
3806 if (!integer_zerop (TREE_OPERAND (base, 1)))
3808 if (off == NULL_TREE)
3809 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3810 else
3811 off = size_binop (PLUS_EXPR, off,
3812 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3814 base = TREE_OPERAND (base, 0);
3816 else
3817 base = build_fold_addr_expr (base);
3819 if (off == NULL_TREE)
3820 off = size_zero_node;
3822 /* If base is not loop invariant, then either off is 0 and we start with
3823 just the constant offset in the loop invariant BASE, continuing with
3824 base as OFF, or we give up.
3825 We could handle the latter case by gimplifying the addition of base + off
3826 into some SSA_NAME and using that as OFF, but for now punt. */
3827 if (!expr_invariant_in_loop_p (loop, base))
3829 if (!integer_zerop (off))
3830 return false;
3831 off = base;
3832 base = size_int (pbytepos);
3834 /* Otherwise put base + constant offset into the loop invariant BASE
3835 and continue with OFF. */
3836 else
3838 base = fold_convert (sizetype, base);
3839 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3842 /* OFF at this point may be either a SSA_NAME or some tree expression
3843 from get_inner_reference. Try to peel off loop invariants from it
3844 into BASE as long as possible. */
3845 STRIP_NOPS (off);
3846 while (offtype == NULL_TREE)
3848 enum tree_code code;
3849 tree op0, op1, add = NULL_TREE;
3851 if (TREE_CODE (off) == SSA_NAME)
3853 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3855 if (expr_invariant_in_loop_p (loop, off))
3856 return false;
3858 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3859 break;
3861 op0 = gimple_assign_rhs1 (def_stmt);
3862 code = gimple_assign_rhs_code (def_stmt);
3863 op1 = gimple_assign_rhs2 (def_stmt);
3865 else
3867 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3868 return false;
3869 code = TREE_CODE (off);
3870 extract_ops_from_tree (off, &code, &op0, &op1);
3872 switch (code)
3874 case POINTER_PLUS_EXPR:
3875 case PLUS_EXPR:
3876 if (expr_invariant_in_loop_p (loop, op0))
3878 add = op0;
3879 off = op1;
3880 do_add:
3881 add = fold_convert (sizetype, add);
3882 if (scale != 1)
3883 add = size_binop (MULT_EXPR, add, size_int (scale));
3884 base = size_binop (PLUS_EXPR, base, add);
3885 continue;
3887 if (expr_invariant_in_loop_p (loop, op1))
3889 add = op1;
3890 off = op0;
3891 goto do_add;
3893 break;
3894 case MINUS_EXPR:
3895 if (expr_invariant_in_loop_p (loop, op1))
3897 add = fold_convert (sizetype, op1);
3898 add = size_binop (MINUS_EXPR, size_zero_node, add);
3899 off = op0;
3900 goto do_add;
3902 break;
3903 case MULT_EXPR:
3904 if (scale == 1 && tree_fits_shwi_p (op1))
3906 int new_scale = tree_to_shwi (op1);
3907 /* Only treat this as a scaling operation if the target
3908 supports it for at least some offset type. */
3909 if (use_ifn_p
3910 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3911 masked_p, vectype, memory_type,
3912 signed_char_type_node,
3913 new_scale, &ifn,
3914 &offset_vectype)
3915 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3916 masked_p, vectype, memory_type,
3917 unsigned_char_type_node,
3918 new_scale, &ifn,
3919 &offset_vectype))
3920 break;
3921 scale = new_scale;
3922 off = op0;
3923 continue;
3925 break;
3926 case SSA_NAME:
3927 off = op0;
3928 continue;
3929 CASE_CONVERT:
3930 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3931 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3932 break;
3934 /* Don't include the conversion if the target is happy with
3935 the current offset type. */
3936 if (use_ifn_p
3937 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3938 masked_p, vectype, memory_type,
3939 TREE_TYPE (off), scale, &ifn,
3940 &offset_vectype))
3941 break;
3943 if (TYPE_PRECISION (TREE_TYPE (op0))
3944 == TYPE_PRECISION (TREE_TYPE (off)))
3946 off = op0;
3947 continue;
3950 if (TYPE_PRECISION (TREE_TYPE (op0))
3951 < TYPE_PRECISION (TREE_TYPE (off)))
3953 off = op0;
3954 offtype = TREE_TYPE (off);
3955 STRIP_NOPS (off);
3956 continue;
3958 break;
3959 default:
3960 break;
3962 break;
3965 /* If at the end OFF still isn't a SSA_NAME or isn't
3966 defined in the loop, punt. */
3967 if (TREE_CODE (off) != SSA_NAME
3968 || expr_invariant_in_loop_p (loop, off))
3969 return false;
3971 if (offtype == NULL_TREE)
3972 offtype = TREE_TYPE (off);
3974 if (use_ifn_p)
3976 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
3977 vectype, memory_type, offtype, scale,
3978 &ifn, &offset_vectype))
3979 return false;
3981 else
3983 if (DR_IS_READ (dr))
3985 if (targetm.vectorize.builtin_gather)
3986 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3988 else
3990 if (targetm.vectorize.builtin_scatter)
3991 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3994 if (!decl)
3995 return false;
3997 ifn = IFN_LAST;
3998 /* The offset vector type will be read from DECL when needed. */
3999 offset_vectype = NULL_TREE;
4002 info->ifn = ifn;
4003 info->decl = decl;
4004 info->base = base;
4005 info->offset = off;
4006 info->offset_dt = vect_unknown_def_type;
4007 info->offset_vectype = offset_vectype;
4008 info->scale = scale;
4009 info->element_type = TREE_TYPE (vectype);
4010 info->memory_type = memory_type;
4011 return true;
4014 /* Find the data references in STMT, analyze them with respect to LOOP and
4015 append them to DATAREFS. Return false if datarefs in this stmt cannot
4016 be handled. */
4018 opt_result
4019 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4020 vec<data_reference_p> *datarefs,
4021 vec<int> *dataref_groups, int group_id)
4023 /* We can ignore clobbers for dataref analysis - they are removed during
4024 loop vectorization and BB vectorization checks dependences with a
4025 stmt walk. */
4026 if (gimple_clobber_p (stmt))
4027 return opt_result::success ();
4029 if (gimple_has_volatile_ops (stmt))
4030 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4031 stmt);
4033 if (stmt_can_throw_internal (cfun, stmt))
4034 return opt_result::failure_at (stmt,
4035 "not vectorized:"
4036 " statement can throw an exception: %G",
4037 stmt);
4039 auto_vec<data_reference_p, 2> refs;
4040 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4041 if (!res)
4042 return res;
4044 if (refs.is_empty ())
4045 return opt_result::success ();
4047 if (refs.length () > 1)
4048 return opt_result::failure_at (stmt,
4049 "not vectorized:"
4050 " more than one data ref in stmt: %G", stmt);
4052 if (gcall *call = dyn_cast <gcall *> (stmt))
4053 if (!gimple_call_internal_p (call)
4054 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4055 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4056 return opt_result::failure_at (stmt,
4057 "not vectorized: dr in a call %G", stmt);
4059 data_reference_p dr = refs.pop ();
4060 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4061 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4062 return opt_result::failure_at (stmt,
4063 "not vectorized:"
4064 " statement is bitfield access %G", stmt);
4066 if (DR_BASE_ADDRESS (dr)
4067 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4068 return opt_result::failure_at (stmt,
4069 "not vectorized:"
4070 " base addr of dr is a constant\n");
4072 /* Check whether this may be a SIMD lane access and adjust the
4073 DR to make it easier for us to handle it. */
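  /* The access is expected to have the form
     *(BASE + GOMP_SIMD_LANE (SIMDUID) * STEP) with STEP equal to the element
     size; such a DR is rewritten below to have offset 0 and step equal to
     the element size, and is marked as a simd-lane access.  */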
4074 if (loop
4075 && loop->simduid
4076 && (!DR_BASE_ADDRESS (dr)
4077 || !DR_OFFSET (dr)
4078 || !DR_INIT (dr)
4079 || !DR_STEP (dr)))
4081 struct data_reference *newdr
4082 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4083 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4084 if (DR_BASE_ADDRESS (newdr)
4085 && DR_OFFSET (newdr)
4086 && DR_INIT (newdr)
4087 && DR_STEP (newdr)
4088 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4089 && integer_zerop (DR_STEP (newdr)))
4091 tree base_address = DR_BASE_ADDRESS (newdr);
4092 tree off = DR_OFFSET (newdr);
4093 tree step = ssize_int (1);
4094 if (integer_zerop (off)
4095 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4097 off = TREE_OPERAND (base_address, 1);
4098 base_address = TREE_OPERAND (base_address, 0);
4100 STRIP_NOPS (off);
4101 if (TREE_CODE (off) == MULT_EXPR
4102 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4104 step = TREE_OPERAND (off, 1);
4105 off = TREE_OPERAND (off, 0);
4106 STRIP_NOPS (off);
4108 if (CONVERT_EXPR_P (off)
4109 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4110 < TYPE_PRECISION (TREE_TYPE (off))))
4111 off = TREE_OPERAND (off, 0);
4112 if (TREE_CODE (off) == SSA_NAME)
4114 gimple *def = SSA_NAME_DEF_STMT (off);
4115 /* Look through widening conversion. */
4116 if (is_gimple_assign (def)
4117 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4119 tree rhs1 = gimple_assign_rhs1 (def);
4120 if (TREE_CODE (rhs1) == SSA_NAME
4121 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4122 && (TYPE_PRECISION (TREE_TYPE (off))
4123 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4124 def = SSA_NAME_DEF_STMT (rhs1);
4126 if (is_gimple_call (def)
4127 && gimple_call_internal_p (def)
4128 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4130 tree arg = gimple_call_arg (def, 0);
4131 tree reft = TREE_TYPE (DR_REF (newdr));
4132 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4133 arg = SSA_NAME_VAR (arg);
4134 if (arg == loop->simduid
4135 /* For now. */
4136 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4138 DR_BASE_ADDRESS (newdr) = base_address;
4139 DR_OFFSET (newdr) = ssize_int (0);
4140 DR_STEP (newdr) = step;
4141 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4142 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4143 /* Mark as simd-lane access. */
4144 tree arg2 = gimple_call_arg (def, 1);
4145 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4146 free_data_ref (dr);
4147 datarefs->safe_push (newdr);
4148 if (dataref_groups)
4149 dataref_groups->safe_push (group_id);
4150 return opt_result::success ();
4155 free_data_ref (newdr);
4158 datarefs->safe_push (dr);
4159 if (dataref_groups)
4160 dataref_groups->safe_push (group_id);
4161 return opt_result::success ();
4164 /* Function vect_analyze_data_refs.
4166 Find all the data references in the loop or basic block.
4168 The general structure of the analysis of data refs in the vectorizer is as
4169 follows:
4170 1- vect_analyze_data_refs(loop/bb): call
4171 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4172 in the loop/bb and their dependences.
4173 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4174 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4175 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4179 opt_result
4180 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4182 class loop *loop = NULL;
4183 unsigned int i;
4184 struct data_reference *dr;
4185 tree scalar_type;
4187 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4189 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4190 loop = LOOP_VINFO_LOOP (loop_vinfo);
4192 /* Go through the data-refs, check that the analysis succeeded. Update
4193 pointer from stmt_vec_info struct to DR and vectype. */
4195 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4196 FOR_EACH_VEC_ELT (datarefs, i, dr)
4198 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4199 poly_uint64 vf;
4201 gcc_assert (DR_REF (dr));
4202 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4203 gcc_assert (!stmt_info->dr_aux.dr);
4204 stmt_info->dr_aux.dr = dr;
4205 stmt_info->dr_aux.stmt = stmt_info;
4207 /* Check that analysis of the data-ref succeeded. */
4208 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4209 || !DR_STEP (dr))
4211 bool maybe_gather
4212 = DR_IS_READ (dr)
4213 && !TREE_THIS_VOLATILE (DR_REF (dr))
4214 && (targetm.vectorize.builtin_gather != NULL
4215 || supports_vec_gather_load_p ());
4216 bool maybe_scatter
4217 = DR_IS_WRITE (dr)
4218 && !TREE_THIS_VOLATILE (DR_REF (dr))
4219 && (targetm.vectorize.builtin_scatter != NULL
4220 || supports_vec_scatter_store_p ());
4222 /* If target supports vector gather loads or scatter stores,
4223 see if they can be used. */
4224 if (is_a <loop_vec_info> (vinfo)
4225 && !nested_in_vect_loop_p (loop, stmt_info))
4227 if (maybe_gather || maybe_scatter)
4229 if (maybe_gather)
4230 gatherscatter = GATHER;
4231 else
4232 gatherscatter = SCATTER;
4236 if (gatherscatter == SG_NONE)
4238 if (dump_enabled_p ())
4239 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4240 "not vectorized: data ref analysis "
4241 "failed %G", stmt_info->stmt);
4242 if (is_a <bb_vec_info> (vinfo))
4244 /* In BB vectorization the ref can still participate
4245 in dependence analysis, we just can't vectorize it. */
4246 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4247 continue;
4249 return opt_result::failure_at (stmt_info->stmt,
4250 "not vectorized:"
4251 " data ref analysis failed: %G",
4252 stmt_info->stmt);
4256 /* See if this was detected as a SIMD lane access. */
4257 if (dr->aux == (void *)-1
4258 || dr->aux == (void *)-2
4259 || dr->aux == (void *)-3
4260 || dr->aux == (void *)-4)
4262 if (nested_in_vect_loop_p (loop, stmt_info))
4263 return opt_result::failure_at (stmt_info->stmt,
4264 "not vectorized:"
4265 " data ref analysis failed: %G",
4266 stmt_info->stmt);
4267 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4268 = -(uintptr_t) dr->aux;
4271 tree base = get_base_address (DR_REF (dr));
4272 if (base && VAR_P (base) && DECL_NONALIASED (base))
4274 if (dump_enabled_p ())
4275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4276 "not vectorized: base object not addressable "
4277 "for stmt: %G", stmt_info->stmt);
4278 if (is_a <bb_vec_info> (vinfo))
4280 /* In BB vectorization the ref can still participate
4281 in dependence analysis, we just can't vectorize it. */
4282 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4283 continue;
4285 return opt_result::failure_at (stmt_info->stmt,
4286 "not vectorized: base object not"
4287 " addressable for stmt: %G",
4288 stmt_info->stmt);
4291 if (is_a <loop_vec_info> (vinfo)
4292 && DR_STEP (dr)
4293 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4295 if (nested_in_vect_loop_p (loop, stmt_info))
4296 return opt_result::failure_at (stmt_info->stmt,
4297 "not vectorized: "
4298 "not suitable for strided load %G",
4299 stmt_info->stmt);
4300 STMT_VINFO_STRIDED_P (stmt_info) = true;
4303 /* Update DR field in stmt_vec_info struct. */
4305 /* If the dataref is in an inner-loop of the loop that is considered
4306 for vectorization, we also want to analyze the access relative to
4307 the outer-loop (DR contains information only relative to the
4308 inner-most enclosing loop). We do that by building a reference to the
4309 first location accessed by the inner-loop, and analyze it relative to
4310 the outer-loop. */
4311 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4313 /* Build a reference to the first location accessed by the
4314 inner loop: *(BASE + INIT + OFFSET). By construction,
4315 this address must be invariant in the inner loop, so we
4316 can consider it as being used in the outer loop. */
4317 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4318 tree offset = unshare_expr (DR_OFFSET (dr));
4319 tree init = unshare_expr (DR_INIT (dr));
4320 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4321 init, offset);
4322 tree init_addr = fold_build_pointer_plus (base, init_offset);
4323 tree init_ref = build_fold_indirect_ref (init_addr);
4325 if (dump_enabled_p ())
4326 dump_printf_loc (MSG_NOTE, vect_location,
4327 "analyze in outer loop: %T\n", init_ref);
4329 opt_result res
4330 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4331 init_ref, loop, stmt_info->stmt);
4332 if (!res)
4333 /* dr_analyze_innermost already explained the failure. */
4334 return res;
4336 if (dump_enabled_p ())
4337 dump_printf_loc (MSG_NOTE, vect_location,
4338 "\touter base_address: %T\n"
4339 "\touter offset from base address: %T\n"
4340 "\touter constant offset from base address: %T\n"
4341 "\touter step: %T\n"
4342 "\touter base alignment: %d\n\n"
4343 "\touter base misalignment: %d\n"
4344 "\touter offset alignment: %d\n"
4345 "\touter step alignment: %d\n",
4346 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4347 STMT_VINFO_DR_OFFSET (stmt_info),
4348 STMT_VINFO_DR_INIT (stmt_info),
4349 STMT_VINFO_DR_STEP (stmt_info),
4350 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4351 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4352 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4353 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4356 /* Set vectype for STMT. */
4357 scalar_type = TREE_TYPE (DR_REF (dr));
4358 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4359 if (!vectype)
4361 if (dump_enabled_p ())
4363 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4364 "not vectorized: no vectype for stmt: %G",
4365 stmt_info->stmt);
4366 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4367 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4368 scalar_type);
4369 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4372 if (is_a <bb_vec_info> (vinfo))
4374 /* No vector type is fine, the ref can still participate
4375 in dependence analysis, we just can't vectorize it. */
4376 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4377 continue;
4379 if (fatal)
4380 *fatal = false;
4381 return opt_result::failure_at (stmt_info->stmt,
4382 "not vectorized:"
4383 " no vectype for stmt: %G"
4384 " scalar_type: %T\n",
4385 stmt_info->stmt, scalar_type);
4387 else
4389 if (dump_enabled_p ())
4390 dump_printf_loc (MSG_NOTE, vect_location,
4391 "got vectype for stmt: %G%T\n",
4392 stmt_info->stmt, vectype);
4395 /* Adjust the minimal vectorization factor according to the
4396 vector type. */
4397 vf = TYPE_VECTOR_SUBPARTS (vectype);
4398 *min_vf = upper_bound (*min_vf, vf);
4400 /* Leave the BB vectorizer to pick the vector type later, based on
4401 the final dataref group size and SLP node size. */
4402 if (is_a <loop_vec_info> (vinfo))
4403 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4405 if (gatherscatter != SG_NONE)
4407 gather_scatter_info gs_info;
4408 if (!vect_check_gather_scatter (stmt_info,
4409 as_a <loop_vec_info> (vinfo),
4410 &gs_info)
4411 || !get_vectype_for_scalar_type (vinfo,
4412 TREE_TYPE (gs_info.offset)))
4414 if (fatal)
4415 *fatal = false;
4416 return opt_result::failure_at
4417 (stmt_info->stmt,
4418 (gatherscatter == GATHER)
4419 ? "not vectorized: not suitable for gather load %G"
4420 : "not vectorized: not suitable for scatter store %G",
4421 stmt_info->stmt);
4423 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4427 /* We used to stop processing and prune the list here. Verify we no
4428 longer need to. */
4429 gcc_assert (i == datarefs.length ());
4431 return opt_result::success ();
4435 /* Function vect_get_new_vect_var.
4437 Returns a name for a new variable. The current naming scheme prepends a
4438 prefix ("vect_", "stmp_", "mask_" or "vectp_", depending on the value of
4439 VAR_KIND) to the name of vectorizer-generated variables, appending NAME
4440 if provided. */
4442 tree
4443 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4445 const char *prefix;
4446 tree new_vect_var;
4448 switch (var_kind)
4450 case vect_simple_var:
4451 prefix = "vect";
4452 break;
4453 case vect_scalar_var:
4454 prefix = "stmp";
4455 break;
4456 case vect_mask_var:
4457 prefix = "mask";
4458 break;
4459 case vect_pointer_var:
4460 prefix = "vectp";
4461 break;
4462 default:
4463 gcc_unreachable ();
4466 if (name)
4468 char* tmp = concat (prefix, "_", name, NULL);
4469 new_vect_var = create_tmp_reg (type, tmp);
4470 free (tmp);
4472 else
4473 new_vect_var = create_tmp_reg (type, prefix);
4475 return new_vect_var;
4478 /* Like vect_get_new_vect_var but return an SSA name. */
4480 tree
4481 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4483 const char *prefix;
4484 tree new_vect_var;
4486 switch (var_kind)
4488 case vect_simple_var:
4489 prefix = "vect";
4490 break;
4491 case vect_scalar_var:
4492 prefix = "stmp";
4493 break;
4494 case vect_pointer_var:
4495 prefix = "vectp";
4496 break;
4497 default:
4498 gcc_unreachable ();
4501 if (name)
4503 char* tmp = concat (prefix, "_", name, NULL);
4504 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4505 free (tmp);
4507 else
4508 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4510 return new_vect_var;
4513 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4515 static void
4516 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4518 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4519 int misalign = DR_MISALIGNMENT (dr_info);
4520 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4521 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4522 else
4523 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4524 known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4525 misalign);
4528 /* Function vect_create_addr_base_for_vector_ref.
4530 Create an expression that computes the address of the first memory location
4531 that will be accessed for a data reference.
4533 Input:
4534 STMT_INFO: The statement containing the data reference.
4535 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4536 OFFSET: Optional. If supplied, it is added to the initial address.
4537 LOOP: Specifies the loop-nest relative to which the address should be computed.
4538 For example, when the dataref is in an inner-loop nested in an
4539 outer-loop that is now being vectorized, LOOP can be either the
4540 outer-loop, or the inner-loop. The first memory location accessed
4541 by the following dataref ('in' points to short):
4543 for (i=0; i<N; i++)
4544 for (j=0; j<M; j++)
4545 s += in[i+j]
4547 is as follows:
4548 if LOOP=i_loop: &in (relative to i_loop)
4549 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4550 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4551 initial address. Unlike OFFSET, which is a number of elements to
4552 be added, BYTE_OFFSET is measured in bytes.
4554 Output:
4555 1. Return an SSA_NAME whose value is the address of the memory location of
4556 the first vector of the data reference.
4557 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4558 these statement(s) which define the returned SSA_NAME.
4560 FORNOW: We are only handling array accesses with step 1. */
4562 tree
4563 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4564 gimple_seq *new_stmt_list,
4565 tree offset,
4566 tree byte_offset)
4568 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4569 struct data_reference *dr = dr_info->dr;
4570 const char *base_name;
4571 tree addr_base;
4572 tree dest;
4573 gimple_seq seq = NULL;
4574 tree vect_ptr_type;
4575 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4576 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4577 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4579 tree data_ref_base = unshare_expr (drb->base_address);
4580 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4581 tree init = unshare_expr (drb->init);
4583 if (loop_vinfo)
4584 base_name = get_name (data_ref_base);
4585 else
4587 base_offset = ssize_int (0);
4588 init = ssize_int (0);
4589 base_name = get_name (DR_REF (dr));
4592 /* Create base_offset */
4593 base_offset = size_binop (PLUS_EXPR,
4594 fold_convert (sizetype, base_offset),
4595 fold_convert (sizetype, init));
4597 if (offset)
4599 offset = fold_build2 (MULT_EXPR, sizetype,
4600 fold_convert (sizetype, offset), step);
4601 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4602 base_offset, offset);
4604 if (byte_offset)
4606 byte_offset = fold_convert (sizetype, byte_offset);
4607 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4608 base_offset, byte_offset);
4611 /* base + base_offset */
4612 if (loop_vinfo)
4613 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4614 else
4616 addr_base = build1 (ADDR_EXPR,
4617 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4618 unshare_expr (DR_REF (dr)));
4621 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4622 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4623 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4624 gimple_seq_add_seq (new_stmt_list, seq);
4626 if (DR_PTR_INFO (dr)
4627 && TREE_CODE (addr_base) == SSA_NAME
4628 && !SSA_NAME_PTR_INFO (addr_base))
4630 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4631 if (offset || byte_offset)
4632 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4635 if (dump_enabled_p ())
4636 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4638 return addr_base;
4642 /* Function vect_create_data_ref_ptr.
4644 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4645 location accessed in the loop by STMT_INFO, along with the def-use update
4646 chain to appropriately advance the pointer through the loop iterations.
4647 Also set aliasing information for the pointer. This pointer is used by
4648 the callers to this function to create a memory reference expression for
4649 vector load/store access.
4651 Input:
4652 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4653 GIMPLE_ASSIGN <name, data-ref> or
4654 GIMPLE_ASSIGN <data-ref, name>.
4655 2. AGGR_TYPE: the type of the reference, which should be either a vector
4656 or an array.
4657 3. AT_LOOP: the loop where the vector memref is to be created.
4658 4. OFFSET (optional): an offset to be added to the initial address accessed
4659 by the data-ref in STMT_INFO.
4660 5. GSI: location where the new stmts are to be placed if there is no loop
4661 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4662 pointing to the initial address.
4663 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4664 to the initial address accessed by the data-ref in STMT_INFO. This is
4665 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4666 in bytes.
4667 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4668 to the IV during each iteration of the loop. NULL says to move
4669 by one copy of AGGR_TYPE up or down, depending on the step of the
4670 data reference.
4672 Output:
4673 1. Declare a new ptr to vector_type, and have it point to the base of the
4674 data reference (the initial address accessed by the data reference).
4675 For example, for vector of type V8HI, the following code is generated:
4677 v8hi *ap;
4678 ap = (v8hi *)initial_address;
4680 if OFFSET is not supplied:
4681 initial_address = &a[init];
4682 if OFFSET is supplied:
4683 initial_address = &a[init + OFFSET];
4684 if BYTE_OFFSET is supplied:
4685 initial_address = &a[init] + BYTE_OFFSET;
4687 Return the initial_address in INITIAL_ADDRESS.
4689 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4690 update the pointer in each iteration of the loop.
4692 Return the increment stmt that updates the pointer in PTR_INCR.
4694 3. Return the pointer. */
4696 tree
4697 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4698 tree aggr_type, class loop *at_loop, tree offset,
4699 tree *initial_address, gimple_stmt_iterator *gsi,
4700 gimple **ptr_incr, bool only_init,
4701 tree byte_offset, tree iv_step)
4703 const char *base_name;
4704 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4705 class loop *loop = NULL;
4706 bool nested_in_vect_loop = false;
4707 class loop *containing_loop = NULL;
4708 tree aggr_ptr_type;
4709 tree aggr_ptr;
4710 tree new_temp;
4711 gimple_seq new_stmt_list = NULL;
4712 edge pe = NULL;
4713 basic_block new_bb;
4714 tree aggr_ptr_init;
4715 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4716 struct data_reference *dr = dr_info->dr;
4717 tree aptr;
4718 gimple_stmt_iterator incr_gsi;
4719 bool insert_after;
4720 tree indx_before_incr, indx_after_incr;
4721 gimple *incr;
4722 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4724 gcc_assert (iv_step != NULL_TREE
4725 || TREE_CODE (aggr_type) == ARRAY_TYPE
4726 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4728 if (loop_vinfo)
4730 loop = LOOP_VINFO_LOOP (loop_vinfo);
4731 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4732 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4733 pe = loop_preheader_edge (loop);
4735 else
4737 gcc_assert (bb_vinfo);
4738 only_init = true;
4739 *ptr_incr = NULL;
4742 /* Create an expression for the first address accessed by this load
4743 in LOOP. */
4744 base_name = get_name (DR_BASE_ADDRESS (dr));
4746 if (dump_enabled_p ())
4748 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4749 dump_printf_loc (MSG_NOTE, vect_location,
4750 "create %s-pointer variable to type: %T",
4751 get_tree_code_name (TREE_CODE (aggr_type)),
4752 aggr_type);
4753 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4754 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4755 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4756 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4757 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4758 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4759 else
4760 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4761 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4764 /* (1) Create the new aggregate-pointer variable.
4765 Vector and array types inherit the alias set of their component
4766 type by default so we need to use a ref-all pointer if the data
4767 reference does not conflict with the created aggregated data
4768 reference because it is not addressable. */
4769 bool need_ref_all = false;
4770 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4771 get_alias_set (DR_REF (dr))))
4772 need_ref_all = true;
4773 /* Likewise for any of the data references in the stmt group. */
4774 else if (DR_GROUP_SIZE (stmt_info) > 1)
4776 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4779 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4780 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4781 get_alias_set (DR_REF (sdr))))
4783 need_ref_all = true;
4784 break;
4786 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4788 while (sinfo);
4790 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4791 need_ref_all);
4792 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4795 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4796 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4797 def-use update cycles for the pointer: one relative to the outer-loop
4798 (LOOP), which is what steps (3) and (4) below do. The other is relative
4799 to the inner-loop (which is the inner-most loop containing the dataref),
4800 and this is done by step (5) below.
4802 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4803 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4804 redundant. Steps (3),(4) create the following:
4806 vp0 = &base_addr;
4807 LOOP: vp1 = phi(vp0,vp2)
4810 vp2 = vp1 + step
4811 goto LOOP
4813 If there is an inner-loop nested in loop, then step (5) will also be
4814 applied, and an additional update in the inner-loop will be created:
4816 vp0 = &base_addr;
4817 LOOP: vp1 = phi(vp0,vp2)
4819 inner: vp3 = phi(vp1,vp4)
4820 vp4 = vp3 + inner_step
4821 if () goto inner
4823 vp2 = vp1 + step
4824 if () goto LOOP */
4826 /* (2) Calculate the initial address of the aggregate-pointer, and set
4827 the aggregate-pointer to point to it before the loop. */
4829 /* Create &(base[init_val + offset] + byte_offset) in the loop preheader. */
4831 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
4832 stmt_info, &new_stmt_list,
4833 offset, byte_offset);
4834 if (new_stmt_list)
4836 if (pe)
4838 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4839 gcc_assert (!new_bb);
4841 else
4842 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4845 *initial_address = new_temp;
4846 aggr_ptr_init = new_temp;
4848 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4849 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4850 inner-loop nested in LOOP (during outer-loop vectorization). */
4852 /* No update in loop is required. */
4853 if (only_init && (!loop_vinfo || at_loop == loop))
4854 aptr = aggr_ptr_init;
4855 else
4857 /* Accesses to invariant addresses should be handled specially
4858 by the caller. */
4859 tree step = vect_dr_behavior (vinfo, dr_info)->step;
4860 gcc_assert (!integer_zerop (step));
4862 if (iv_step == NULL_TREE)
4864 /* The step of the aggregate pointer is the type size,
4865 negated for downward accesses. */
4866 iv_step = TYPE_SIZE_UNIT (aggr_type);
4867 if (tree_int_cst_sgn (step) == -1)
4868 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4871 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4873 create_iv (aggr_ptr_init,
4874 fold_convert (aggr_ptr_type, iv_step),
4875 aggr_ptr, loop, &incr_gsi, insert_after,
4876 &indx_before_incr, &indx_after_incr);
4877 incr = gsi_stmt (incr_gsi);
4879 /* Copy the points-to information if it exists. */
4880 if (DR_PTR_INFO (dr))
4882 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4883 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4885 if (ptr_incr)
4886 *ptr_incr = incr;
4888 aptr = indx_before_incr;
4891 if (!nested_in_vect_loop || only_init)
4892 return aptr;
4895 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4896 nested in LOOP, if exists. */
4898 gcc_assert (nested_in_vect_loop);
4899 if (!only_init)
4901 standard_iv_increment_position (containing_loop, &incr_gsi,
4902 &insert_after);
4903 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4904 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4905 &indx_after_incr);
4906 incr = gsi_stmt (incr_gsi);
4908 /* Copy the points-to information if it exists. */
4909 if (DR_PTR_INFO (dr))
4911 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4912 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4914 if (ptr_incr)
4915 *ptr_incr = incr;
4917 return indx_before_incr;
4919 else
4920 gcc_unreachable ();
4924 /* Function bump_vector_ptr
4926 Increment a pointer (to a vector type) by vector-size. If requested,
4927 i.e. if PTR_INCR is given, then also connect the new increment stmt
4928 to the existing def-use update-chain of the pointer, by modifying
4929 the PTR_INCR as illustrated below:
4931 The pointer def-use update-chain before this function:
4932 DATAREF_PTR = phi (p_0, p_2)
4933 ....
4934 PTR_INCR: p_2 = DATAREF_PTR + step
4936 The pointer def-use update-chain after this function:
4937 DATAREF_PTR = phi (p_0, p_2)
4938 ....
4939 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4940 ....
4941 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4943 Input:
4944 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4945 in the loop.
4946 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4947 the loop. The increment amount across iterations is expected
4948 to be vector_size.
4949 GSI - location where the new update stmt is to be placed.
4950 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4951 BUMP - optional. The offset by which to bump the pointer. If not given,
4952 the offset is assumed to be vector_size.
4954 Output: Return NEW_DATAREF_PTR as illustrated above.
4958 tree
4959 bump_vector_ptr (vec_info *vinfo,
4960 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4961 stmt_vec_info stmt_info, tree bump)
4963 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4964 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4965 tree update = TYPE_SIZE_UNIT (vectype);
4966 gassign *incr_stmt;
4967 ssa_op_iter iter;
4968 use_operand_p use_p;
4969 tree new_dataref_ptr;
4971 if (bump)
4972 update = bump;
4974 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4975 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4976 else
4977 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4978 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4979 dataref_ptr, update);
4980 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
4982 /* Copy the points-to information if it exists. */
4983 if (DR_PTR_INFO (dr))
4985 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4986 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4989 if (!ptr_incr)
4990 return new_dataref_ptr;
4992 /* Update the vector-pointer's cross-iteration increment. */
4993 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4995 tree use = USE_FROM_PTR (use_p);
4997 if (use == dataref_ptr)
4998 SET_USE (use_p, new_dataref_ptr);
4999 else
5000 gcc_assert (operand_equal_p (use, update, 0));
5003 return new_dataref_ptr;
5007 /* Copy memory reference info such as base/clique from the SRC reference
5008 to the DEST MEM_REF. */
5010 void
5011 vect_copy_ref_info (tree dest, tree src)
5013 if (TREE_CODE (dest) != MEM_REF)
5014 return;
5016 tree src_base = src;
5017 while (handled_component_p (src_base))
5018 src_base = TREE_OPERAND (src_base, 0);
5019 if (TREE_CODE (src_base) != MEM_REF
5020 && TREE_CODE (src_base) != TARGET_MEM_REF)
5021 return;
5023 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5024 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5028 /* Function vect_create_destination_var.
5030 Create a new temporary of type VECTYPE. */
5032 tree
5033 vect_create_destination_var (tree scalar_dest, tree vectype)
5035 tree vec_dest;
5036 const char *name;
5037 char *new_name;
5038 tree type;
5039 enum vect_var_kind kind;
5041 kind = vectype
5042 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5043 ? vect_mask_var
5044 : vect_simple_var
5045 : vect_scalar_var;
5046 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5048 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5050 name = get_name (scalar_dest);
5051 if (name)
5052 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5053 else
5054 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5055 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5056 free (new_name);
5058 return vec_dest;
5061 /* Function vect_grouped_store_supported.
5063 Returns TRUE if interleave high and interleave low permutations
5064 are supported, and FALSE otherwise. */
5066 bool
5067 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5069 machine_mode mode = TYPE_MODE (vectype);
5071 /* vect_permute_store_chain requires the group size to be equal to 3 or
5072 be a power of two. */
5073 if (count != 3 && exact_log2 (count) == -1)
5075 if (dump_enabled_p ())
5076 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5077 "the size of the group of accesses"
5078 " is not a power of 2 or not eqaul to 3\n");
5079 return false;
5082 /* Check that the permutation is supported. */
5083 if (VECTOR_MODE_P (mode))
5085 unsigned int i;
5086 if (count == 3)
5088 unsigned int j0 = 0, j1 = 0, j2 = 0;
5089 unsigned int i, j;
5091 unsigned int nelt;
5092 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5094 if (dump_enabled_p ())
5095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5096 "cannot handle groups of 3 stores for"
5097 " variable-length vectors\n");
5098 return false;
5101 vec_perm_builder sel (nelt, nelt, 1);
5102 sel.quick_grow (nelt);
5103 vec_perm_indices indices;
5104 for (j = 0; j < 3; j++)
5106 int nelt0 = ((3 - j) * nelt) % 3;
5107 int nelt1 = ((3 - j) * nelt + 1) % 3;
5108 int nelt2 = ((3 - j) * nelt + 2) % 3;
5109 for (i = 0; i < nelt; i++)
5111 if (3 * i + nelt0 < nelt)
5112 sel[3 * i + nelt0] = j0++;
5113 if (3 * i + nelt1 < nelt)
5114 sel[3 * i + nelt1] = nelt + j1++;
5115 if (3 * i + nelt2 < nelt)
5116 sel[3 * i + nelt2] = 0;
5118 indices.new_vector (sel, 2, nelt);
5119 if (!can_vec_perm_const_p (mode, indices))
5121 if (dump_enabled_p ())
5122 dump_printf (MSG_MISSED_OPTIMIZATION,
5123 "permutation op not supported by target.\n");
5124 return false;
5127 for (i = 0; i < nelt; i++)
5129 if (3 * i + nelt0 < nelt)
5130 sel[3 * i + nelt0] = 3 * i + nelt0;
5131 if (3 * i + nelt1 < nelt)
5132 sel[3 * i + nelt1] = 3 * i + nelt1;
5133 if (3 * i + nelt2 < nelt)
5134 sel[3 * i + nelt2] = nelt + j2++;
5136 indices.new_vector (sel, 2, nelt);
5137 if (!can_vec_perm_const_p (mode, indices))
5139 if (dump_enabled_p ())
5140 dump_printf (MSG_MISSED_OPTIMIZATION,
5141 "permutation op not supported by target.\n");
5142 return false;
5145 return true;
5147 else
5149 /* If length is not equal to 3 then only power of 2 is supported. */
5150 gcc_assert (pow2p_hwi (count));
5151 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5153 /* The encoding has 2 interleaved stepped patterns. */
5154 vec_perm_builder sel (nelt, 2, 3);
5155 sel.quick_grow (6);
5156 for (i = 0; i < 3; i++)
5158 sel[i * 2] = i;
5159 sel[i * 2 + 1] = i + nelt;
5161 vec_perm_indices indices (sel, 2, nelt);
5162 if (can_vec_perm_const_p (mode, indices))
5164 for (i = 0; i < 6; i++)
5165 sel[i] += exact_div (nelt, 2);
5166 indices.new_vector (sel, 2, nelt);
5167 if (can_vec_perm_const_p (mode, indices))
5168 return true;
5173 if (dump_enabled_p ())
5174 dump_printf (MSG_MISSED_OPTIMIZATION,
5175 "permutation op not supported by target.\n");
5176 return false;
5180 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5181 type VECTYPE. MASKED_P says whether the masked form is needed. */
5183 bool
5184 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5185 bool masked_p)
5187 if (masked_p)
5188 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5189 vec_mask_store_lanes_optab,
5190 vectype, count);
5191 else
5192 return vect_lanes_optab_supported_p ("vec_store_lanes",
5193 vec_store_lanes_optab,
5194 vectype, count);
5198 /* Function vect_permute_store_chain.
5200 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5201 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5202 the data correctly for the stores. Return the final references for stores
5203 in RESULT_CHAIN.
5205 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5206 The input is 4 vectors each containing 8 elements. We assign a number to
5207 each element, the input sequence is:
5209 1st vec: 0 1 2 3 4 5 6 7
5210 2nd vec: 8 9 10 11 12 13 14 15
5211 3rd vec: 16 17 18 19 20 21 22 23
5212 4th vec: 24 25 26 27 28 29 30 31
5214 The output sequence should be:
5216 1st vec: 0 8 16 24 1 9 17 25
5217 2nd vec: 2 10 18 26 3 11 19 27
5218 3rd vec: 4 12 20 28 5 13 21 29
5219 4th vec: 6 14 22 30 7 15 23 31
5221 i.e., we interleave the contents of the four vectors in their order.
5223 We use interleave_high/low instructions to create such output. The input of
5224 each interleave_high/low operation is two vectors:
5225 1st vec 2nd vec
5226 0 1 2 3 4 5 6 7
5227 the even elements of the result vector are obtained left-to-right from the
5228 high/low elements of the first vector. The odd elements of the result are
5229 obtained left-to-right from the high/low elements of the second vector.
5230 The output of interleave_high will be: 0 4 1 5
5231 and of interleave_low: 2 6 3 7
5234 The permutation is done in log LENGTH stages. In each stage interleave_high
5235 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5236 where the first argument is taken from the first half of DR_CHAIN and the
5237 second argument from its second half.
5238 In our example,
5240 I1: interleave_high (1st vec, 3rd vec)
5241 I2: interleave_low (1st vec, 3rd vec)
5242 I3: interleave_high (2nd vec, 4th vec)
5243 I4: interleave_low (2nd vec, 4th vec)
5245 The output for the first stage is:
5247 I1: 0 16 1 17 2 18 3 19
5248 I2: 4 20 5 21 6 22 7 23
5249 I3: 8 24 9 25 10 26 11 27
5250 I4: 12 28 13 29 14 30 15 31
5252 The output of the second stage, i.e. the final result is:
5254 I1: 0 8 16 24 1 9 17 25
5255 I2: 2 10 18 26 3 11 19 27
5256 I3: 4 12 20 28 5 13 21 29
5257 I4: 6 14 22 30 7 15 23 31. */
5259 void
5260 vect_permute_store_chain (vec_info *vinfo, vec<tree> dr_chain,
5261 unsigned int length,
5262 stmt_vec_info stmt_info,
5263 gimple_stmt_iterator *gsi,
5264 vec<tree> *result_chain)
5266 tree vect1, vect2, high, low;
5267 gimple *perm_stmt;
5268 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5269 tree perm_mask_low, perm_mask_high;
5270 tree data_ref;
5271 tree perm3_mask_low, perm3_mask_high;
5272 unsigned int i, j, n, log_length = exact_log2 (length);
5274 result_chain->quick_grow (length);
5275 memcpy (result_chain->address (), dr_chain.address (),
5276 length * sizeof (tree));
5278 if (length == 3)
5280 /* vect_grouped_store_supported ensures that this is constant. */
5281 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5282 unsigned int j0 = 0, j1 = 0, j2 = 0;
5284 vec_perm_builder sel (nelt, nelt, 1);
5285 sel.quick_grow (nelt);
5286 vec_perm_indices indices;
5287 for (j = 0; j < 3; j++)
5289 int nelt0 = ((3 - j) * nelt) % 3;
5290 int nelt1 = ((3 - j) * nelt + 1) % 3;
5291 int nelt2 = ((3 - j) * nelt + 2) % 3;
5293 for (i = 0; i < nelt; i++)
5295 if (3 * i + nelt0 < nelt)
5296 sel[3 * i + nelt0] = j0++;
5297 if (3 * i + nelt1 < nelt)
5298 sel[3 * i + nelt1] = nelt + j1++;
5299 if (3 * i + nelt2 < nelt)
5300 sel[3 * i + nelt2] = 0;
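	      /* For nelt == 8 and j == 0 this gives {0 8 0 1 9 0 2 10}; the
		 positions written as 0 here are don't-cares that the second
		 permutation below (using perm3_mask_high) overwrites with
		 elements of the third vector.  */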
5302 indices.new_vector (sel, 2, nelt);
5303 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5305 for (i = 0; i < nelt; i++)
5307 if (3 * i + nelt0 < nelt)
5308 sel[3 * i + nelt0] = 3 * i + nelt0;
5309 if (3 * i + nelt1 < nelt)
5310 sel[3 * i + nelt1] = 3 * i + nelt1;
5311 if (3 * i + nelt2 < nelt)
5312 sel[3 * i + nelt2] = nelt + j2++;
5314 indices.new_vector (sel, 2, nelt);
5315 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5317 vect1 = dr_chain[0];
5318 vect2 = dr_chain[1];
5320 /* Create interleaving stmt:
5321 low = VEC_PERM_EXPR <vect1, vect2,
5322 {j, nelt, *, j + 1, nelt + j + 1, *,
5323 j + 2, nelt + j + 2, *, ...}> */
5324 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5325 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5326 vect2, perm3_mask_low);
5327 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5329 vect1 = data_ref;
5330 vect2 = dr_chain[2];
5331 /* Create interleaving stmt:
5332 low = VEC_PERM_EXPR <vect1, vect2,
5333 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5334 6, 7, nelt + j + 2, ...}> */
5335 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5336 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5337 vect2, perm3_mask_high);
5338 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5339 (*result_chain)[j] = data_ref;
5342 else
5344 /* If length is not equal to 3 then only power of 2 is supported. */
5345 gcc_assert (pow2p_hwi (length));
5347 /* The encoding has 2 interleaved stepped patterns. */
5348 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5349 vec_perm_builder sel (nelt, 2, 3);
5350 sel.quick_grow (6);
5351 for (i = 0; i < 3; i++)
5353 sel[i * 2] = i;
5354 sel[i * 2 + 1] = i + nelt;
5356 vec_perm_indices indices (sel, 2, nelt);
5357 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5359 for (i = 0; i < 6; i++)
5360 sel[i] += exact_div (nelt, 2);
5361 indices.new_vector (sel, 2, nelt);
5362 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5364 for (i = 0, n = log_length; i < n; i++)
5366 for (j = 0; j < length/2; j++)
5368 vect1 = dr_chain[j];
5369 vect2 = dr_chain[j+length/2];
5371 /* Create interleaving stmt:
5372 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5373 ...}> */
5374 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5375 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5376 vect2, perm_mask_high);
5377 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5378 (*result_chain)[2*j] = high;
5380 /* Create interleaving stmt:
5381 low = VEC_PERM_EXPR <vect1, vect2,
5382 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5383 ...}> */
5384 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5385 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5386 vect2, perm_mask_low);
5387 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5388 (*result_chain)[2*j+1] = low;
5390 memcpy (dr_chain.address (), result_chain->address (),
5391 length * sizeof (tree));
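
/* Illustrative sketch only, not part of GCC: a scalar model of one
   interleave stage of vect_permute_store_chain for the power-of-2 case.
   The function name is hypothetical; the real transform emits
   VEC_PERM_EXPRs using perm_mask_high ({0, nelt, 1, nelt + 1, ...}) and
   perm_mask_low (the same pattern offset by nelt / 2).  With
   vect1 = {0..7} and vect2 = {16..23} this reproduces I1 and I2 from the
   first-stage example in the comment above.  */

static void
model_interleave_high_low (const int *vect1, const int *vect2,
			   unsigned int nelt, int *high, int *low)
{
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      /* HIGH interleaves the first halves of the two inputs ...  */
      high[2 * i] = vect1[i];
      high[2 * i + 1] = vect2[i];
      /* ... and LOW interleaves their second halves.  */
      low[2 * i] = vect1[nelt / 2 + i];
      low[2 * i + 1] = vect2[nelt / 2 + i];
    }
}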
5396 /* Function vect_setup_realignment
5398 This function is called when vectorizing an unaligned load using
5399 the dr_explicit_realign[_optimized] scheme.
5400 This function generates the following code at the loop prolog:
5402 p = initial_addr;
5403 x msq_init = *(floor(p)); # prolog load
5404 realignment_token = call target_builtin;
5405 loop:
5406 x msq = phi (msq_init, ---)
5408 The stmts marked with x are generated only for the case of
5409 dr_explicit_realign_optimized.
5411 The code above sets up a new (vector) pointer, pointing to the first
5412 location accessed by STMT_INFO, and a "floor-aligned" load using that
5413 pointer. It also generates code to compute the "realignment-token"
5414 (if the relevant target hook was defined), and creates a phi-node at the
5415 loop-header bb whose arguments are the result of the prolog-load (created
5416 by this function) and the result of a load that takes place in the loop
5417 (to be created by the caller to this function).
5419 For the case of dr_explicit_realign_optimized:
5420 The caller to this function uses the phi-result (msq) to create the
5421 realignment code inside the loop, and sets up the missing phi argument,
5422 as follows:
5423 loop:
5424 msq = phi (msq_init, lsq)
5425 lsq = *(floor(p')); # load in loop
5426 result = realign_load (msq, lsq, realignment_token);
5428 For the case of dr_explicit_realign:
5429 loop:
5430 msq = *(floor(p)); # load in loop
5431 p' = p + (VS-1);
5432 lsq = *(floor(p')); # load in loop
5433 result = realign_load (msq, lsq, realignment_token);
5435 Input:
5436 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5437 a memory location that may be unaligned.
5438 GSI - place where new code is to be inserted.
5439 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5440 is used.
5442 Output:
5443 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5444 target hook, if defined.
5445 Return value - the result of the loop-header phi node. */
5447 tree
5448 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5449 gimple_stmt_iterator *gsi, tree *realignment_token,
5450 enum dr_alignment_support alignment_support_scheme,
5451 tree init_addr,
5452 class loop **at_loop)
5454 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5455 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5456 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5457 struct data_reference *dr = dr_info->dr;
5458 class loop *loop = NULL;
5459 edge pe = NULL;
5460 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5461 tree vec_dest;
5462 gimple *inc;
5463 tree ptr;
5464 tree data_ref;
5465 basic_block new_bb;
5466 tree msq_init = NULL_TREE;
5467 tree new_temp;
5468 gphi *phi_stmt;
5469 tree msq = NULL_TREE;
5470 gimple_seq stmts = NULL;
5471 bool compute_in_loop = false;
5472 bool nested_in_vect_loop = false;
5473 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5474 class loop *loop_for_initial_load = NULL;
5476 if (loop_vinfo)
5478 loop = LOOP_VINFO_LOOP (loop_vinfo);
5479 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5482 gcc_assert (alignment_support_scheme == dr_explicit_realign
5483 || alignment_support_scheme == dr_explicit_realign_optimized);
5485 /* We need to generate three things:
5486 1. the misalignment computation
5487 2. the extra vector load (for the optimized realignment scheme).
5488 3. the phi node for the two vectors from which the realignment is
5489 done (for the optimized realignment scheme). */
5491 /* 1. Determine where to generate the misalignment computation.
5493 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5494 calculation will be generated by this function, outside the loop (in the
5495 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5496 caller, inside the loop.
5498 Background: If the misalignment remains fixed throughout the iterations of
5499 the loop, then both realignment schemes are applicable, and also the
5500 misalignment computation can be done outside LOOP. This is because we are
5501 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5502 are a multiple of VS (the Vector Size), and therefore the misalignment in
5503 different vectorized LOOP iterations is always the same.
5504 The problem arises only if the memory access is in an inner-loop nested
5505 inside LOOP, which is now being vectorized using outer-loop vectorization.
5506 This is the only case when the misalignment of the memory access may not
5507 remain fixed throughout the iterations of the inner-loop (as explained in
5508 detail in vect_supportable_dr_alignment). In this case, not only is the
5509 optimized realignment scheme not applicable, but also the misalignment
5510 computation (and generation of the realignment token that is passed to
5511 REALIGN_LOAD) have to be done inside the loop.
5513 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5514 or not, which in turn determines if the misalignment is computed inside
5515 the inner-loop, or outside LOOP. */
5517 if (init_addr != NULL_TREE || !loop_vinfo)
5519 compute_in_loop = true;
5520 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5524 /* 2. Determine where to generate the extra vector load.
5526 For the optimized realignment scheme, instead of generating two vector
5527 loads in each iteration, we generate a single extra vector load in the
5528 preheader of the loop, and in each iteration reuse the result of the
5529 vector load from the previous iteration. In case the memory access is in
5530 an inner-loop nested inside LOOP, which is now being vectorized using
5531 outer-loop vectorization, we need to determine whether this initial vector
5532 load should be generated at the preheader of the inner-loop, or can be
5533 generated at the preheader of LOOP. If the memory access has no evolution
5534 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5535 to be generated inside LOOP (in the preheader of the inner-loop). */
5537 if (nested_in_vect_loop)
5539 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5540 bool invariant_in_outerloop =
5541 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5542 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5544 else
5545 loop_for_initial_load = loop;
5546 if (at_loop)
5547 *at_loop = loop_for_initial_load;
5549 if (loop_for_initial_load)
5550 pe = loop_preheader_edge (loop_for_initial_load);
5552 /* 3. For the case of the optimized realignment, create the first vector
5553 load at the loop preheader. */
5555 if (alignment_support_scheme == dr_explicit_realign_optimized)
5557 /* Create msq_init = *(floor(p1)) in the loop preheader */
5558 gassign *new_stmt;
5560 gcc_assert (!compute_in_loop);
5561 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5562 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5563 loop_for_initial_load, NULL_TREE,
5564 &init_addr, NULL, &inc, true);
5565 if (TREE_CODE (ptr) == SSA_NAME)
5566 new_temp = copy_ssa_name (ptr);
5567 else
5568 new_temp = make_ssa_name (TREE_TYPE (ptr));
5569 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5570 tree type = TREE_TYPE (ptr);
5571 new_stmt = gimple_build_assign
5572 (new_temp, BIT_AND_EXPR, ptr,
5573 fold_build2 (MINUS_EXPR, type,
5574 build_int_cst (type, 0),
5575 build_int_cst (type, align)));
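      /* I.e. new_temp = ptr & -align, clearing the low-order address bits so
	 that the MEM_REF built below loads from the floor-aligned address
	 floor (p).  */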
5576 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5577 gcc_assert (!new_bb);
5578 data_ref
5579 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5580 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5581 vect_copy_ref_info (data_ref, DR_REF (dr));
5582 new_stmt = gimple_build_assign (vec_dest, data_ref);
5583 new_temp = make_ssa_name (vec_dest, new_stmt);
5584 gimple_assign_set_lhs (new_stmt, new_temp);
5585 if (pe)
5587 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5588 gcc_assert (!new_bb);
5590 else
5591 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5593 msq_init = gimple_assign_lhs (new_stmt);
5596 /* 4. Create realignment token using a target builtin, if available.
5597 It is done either inside the containing loop, or before LOOP (as
5598 determined above). */
5600 if (targetm.vectorize.builtin_mask_for_load)
5602 gcall *new_stmt;
5603 tree builtin_decl;
5605 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5606 if (!init_addr)
5608 /* Generate the INIT_ADDR computation outside LOOP. */
5609 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5610 stmt_info, &stmts,
5611 NULL_TREE);
5612 if (loop)
5614 pe = loop_preheader_edge (loop);
5615 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5616 gcc_assert (!new_bb);
5618 else
5619 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5622 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5623 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5624 vec_dest =
5625 vect_create_destination_var (scalar_dest,
5626 gimple_call_return_type (new_stmt));
5627 new_temp = make_ssa_name (vec_dest, new_stmt);
5628 gimple_call_set_lhs (new_stmt, new_temp);
5630 if (compute_in_loop)
5631 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5632 else
5634 /* Generate the misalignment computation outside LOOP. */
5635 pe = loop_preheader_edge (loop);
5636 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5637 gcc_assert (!new_bb);
5640 *realignment_token = gimple_call_lhs (new_stmt);
5642 /* The result of the CALL_EXPR to this builtin is determined from
5643 the value of the parameter and no global variables are touched
5644 which makes the builtin a "const" function. Requiring the
5645 builtin to have the "const" attribute makes it unnecessary
5646 to call mark_call_clobbered. */
5647 gcc_assert (TREE_READONLY (builtin_decl));
5650 if (alignment_support_scheme == dr_explicit_realign)
5651 return msq;
5653 gcc_assert (!compute_in_loop);
5654 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5657 /* 5. Create msq = phi <msq_init, lsq> in loop */
5659 pe = loop_preheader_edge (containing_loop);
5660 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5661 msq = make_ssa_name (vec_dest);
5662 phi_stmt = create_phi_node (msq, containing_loop->header);
5663 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5665 return msq;
5669 /* Function vect_grouped_load_supported.
5671 COUNT is the size of the load group (the number of statements plus the
5672 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5673 only one statement, with a gap of COUNT - 1.
5675 Returns true if a suitable permute exists. */
5677 bool
5678 vect_grouped_load_supported (tree vectype, bool single_element_p,
5679 unsigned HOST_WIDE_INT count)
5681 machine_mode mode = TYPE_MODE (vectype);
5683 /* If this is single-element interleaving with an element distance
5684 that leaves unused vector loads around, punt - we would at least create
5685 very sub-optimal code in that case (and blow up memory,
5686 see PR65518). */
5687 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5689 if (dump_enabled_p ())
5690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5691 "single-element interleaving not supported "
5692 "for not adjacent vector loads\n");
5693 return false;
5696 /* vect_permute_load_chain requires the group size to be equal to 3 or
5697 be a power of two. */
5698 if (count != 3 && exact_log2 (count) == -1)
5700 if (dump_enabled_p ())
5701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5702 "the size of the group of accesses"
5703 " is not a power of 2 or not equal to 3\n");
5704 return false;
5707 /* Check that the permutation is supported. */
5708 if (VECTOR_MODE_P (mode))
5710 unsigned int i, j;
5711 if (count == 3)
5713 unsigned int nelt;
5714 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5716 if (dump_enabled_p ())
5717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5718 "cannot handle groups of 3 loads for"
5719 " variable-length vectors\n");
5720 return false;
5723 vec_perm_builder sel (nelt, nelt, 1);
5724 sel.quick_grow (nelt);
5725 vec_perm_indices indices;
5726 unsigned int k;
5727 for (k = 0; k < 3; k++)
5729 for (i = 0; i < nelt; i++)
5730 if (3 * i + k < 2 * nelt)
5731 sel[i] = 3 * i + k;
5732 else
5733 sel[i] = 0;
5734 indices.new_vector (sel, 2, nelt);
5735 if (!can_vec_perm_const_p (mode, indices))
5737 if (dump_enabled_p ())
5738 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5739 "shuffle of 3 loads is not supported by"
5740 " target\n");
5741 return false;
5743 for (i = 0, j = 0; i < nelt; i++)
5744 if (3 * i + k < 2 * nelt)
5745 sel[i] = i;
5746 else
5747 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5748 indices.new_vector (sel, 2, nelt);
5749 if (!can_vec_perm_const_p (mode, indices))
5751 if (dump_enabled_p ())
5752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5753 "shuffle of 3 loads is not supported by"
5754 " target\n");
5755 return false;
5758 return true;
5760 else
5762 /* If length is not equal to 3 then only power of 2 is supported. */
5763 gcc_assert (pow2p_hwi (count));
5764 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5766 /* The encoding has a single stepped pattern. */
5767 vec_perm_builder sel (nelt, 1, 3);
5768 sel.quick_grow (3);
5769 for (i = 0; i < 3; i++)
5770 sel[i] = i * 2;
5771 vec_perm_indices indices (sel, 2, nelt);
5772 if (can_vec_perm_const_p (mode, indices))
5774 for (i = 0; i < 3; i++)
5775 sel[i] = i * 2 + 1;
5776 indices.new_vector (sel, 2, nelt);
5777 if (can_vec_perm_const_p (mode, indices))
5778 return true;
5783 if (dump_enabled_p ())
5784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5785 "extract even/odd not supported by target\n");
5786 return false;
5789 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5790 type VECTYPE. MASKED_P says whether the masked form is needed. */
5792 bool
5793 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5794 bool masked_p)
5796 if (masked_p)
5797 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5798 vec_mask_load_lanes_optab,
5799 vectype, count);
5800 else
5801 return vect_lanes_optab_supported_p ("vec_load_lanes",
5802 vec_load_lanes_optab,
5803 vectype, count);
5806 /* Function vect_permute_load_chain.
5808 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5809 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5810 the input data correctly. Return the final references for loads in
5811 RESULT_CHAIN.
5813 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5814 The input is 4 vectors each containing 8 elements. We assign a number to each
5815 element, the input sequence is:
5817 1st vec: 0 1 2 3 4 5 6 7
5818 2nd vec: 8 9 10 11 12 13 14 15
5819 3rd vec: 16 17 18 19 20 21 22 23
5820 4th vec: 24 25 26 27 28 29 30 31
5822 The output sequence should be:
5824 1st vec: 0 4 8 12 16 20 24 28
5825 2nd vec: 1 5 9 13 17 21 25 29
5826 3rd vec: 2 6 10 14 18 22 26 30
5827 4th vec: 3 7 11 15 19 23 27 31
5829 i.e., the first output vector should contain the first elements of each
5830 interleaving group, etc.
5832 We use extract_even/odd instructions to create such output. The input of
5833 each extract_even/odd operation is two vectors
5834 1st vec: 0 1 2 3
5835 2nd vec: 4 5 6 7
5837 and the output is the vector of extracted even/odd elements. The output of
5838 extract_even will be: 0 2 4 6
5839 and of extract_odd: 1 3 5 7
5842 The permutation is done in log LENGTH stages. In each stage extract_even
5843 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5844 their order. In our example,
5846 E1: extract_even (1st vec, 2nd vec)
5847 E2: extract_odd (1st vec, 2nd vec)
5848 E3: extract_even (3rd vec, 4th vec)
5849 E4: extract_odd (3rd vec, 4th vec)
5851 The output for the first stage will be:
5853 E1: 0 2 4 6 8 10 12 14
5854 E2: 1 3 5 7 9 11 13 15
5855 E3: 16 18 20 22 24 26 28 30
5856 E4: 17 19 21 23 25 27 29 31
5858 In order to proceed and create the correct sequence for the next stage (or
5859 for the correct output, if the second stage is the last one, as in our
5860 example), we first put the output of extract_even operation and then the
5861 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5862 The input for the second stage is:
5864 1st vec (E1): 0 2 4 6 8 10 12 14
5865 2nd vec (E3): 16 18 20 22 24 26 28 30
5866 3rd vec (E2): 1 3 5 7 9 11 13 15
5867 4th vec (E4): 17 19 21 23 25 27 29 31
5869 The output of the second stage:
5871 E1: 0 4 8 12 16 20 24 28
5872 E2: 2 6 10 14 18 22 26 30
5873 E3: 1 5 9 13 17 21 25 29
5874 E4: 3 7 11 15 19 23 27 31
5876 And RESULT_CHAIN after reordering:
5878 1st vec (E1): 0 4 8 12 16 20 24 28
5879 2nd vec (E3): 1 5 9 13 17 21 25 29
5880 3rd vec (E2): 2 6 10 14 18 22 26 30
5881 4th vec (E4): 3 7 11 15 19 23 27 31. */
5883 static void
5884 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
5885 unsigned int length,
5886 stmt_vec_info stmt_info,
5887 gimple_stmt_iterator *gsi,
5888 vec<tree> *result_chain)
5890 tree data_ref, first_vect, second_vect;
5891 tree perm_mask_even, perm_mask_odd;
5892 tree perm3_mask_low, perm3_mask_high;
5893 gimple *perm_stmt;
5894 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5895 unsigned int i, j, log_length = exact_log2 (length);
5897 result_chain->quick_grow (length);
5898 memcpy (result_chain->address (), dr_chain.address (),
5899 length * sizeof (tree));
5901 if (length == 3)
5903 /* vect_grouped_load_supported ensures that this is constant. */
5904 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5905 unsigned int k;
5907 vec_perm_builder sel (nelt, nelt, 1);
5908 sel.quick_grow (nelt);
5909 vec_perm_indices indices;
5910 for (k = 0; k < 3; k++)
5912 for (i = 0; i < nelt; i++)
5913 if (3 * i + k < 2 * nelt)
5914 sel[i] = 3 * i + k;
5915 else
5916 sel[i] = 0;
5917 indices.new_vector (sel, 2, nelt);
5918 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5920 for (i = 0, j = 0; i < nelt; i++)
5921 if (3 * i + k < 2 * nelt)
5922 sel[i] = i;
5923 else
5924 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5925 indices.new_vector (sel, 2, nelt);
5926 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5928 first_vect = dr_chain[0];
5929 second_vect = dr_chain[1];
5931 /* Create interleaving stmt (low part of):
5932 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5933 ...}> */
5934 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5935 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5936 second_vect, perm3_mask_low);
5937 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5939 /* Create interleaving stmt (high part of):
5940 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5941 ...}> */
5942 first_vect = data_ref;
5943 second_vect = dr_chain[2];
5944 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5945 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5946 second_vect, perm3_mask_high);
5947 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5948 (*result_chain)[k] = data_ref;
5951 else
5953 /* If length is not equal to 3 then only power of 2 is supported. */
5954 gcc_assert (pow2p_hwi (length));
5956 /* The encoding has a single stepped pattern. */
5957 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5958 vec_perm_builder sel (nelt, 1, 3);
5959 sel.quick_grow (3);
5960 for (i = 0; i < 3; ++i)
5961 sel[i] = i * 2;
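      /* A single stepped pattern {0, 2, 4} that the builder extends to
	 {0, 2, 4, ..., 2 * nelt - 2}, i.e. the even elements of the two
	 concatenated inputs; the {1, 3, 5} pattern built just below selects
	 the odd elements.  */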
5962 vec_perm_indices indices (sel, 2, nelt);
5963 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5965 for (i = 0; i < 3; ++i)
5966 sel[i] = i * 2 + 1;
5967 indices.new_vector (sel, 2, nelt);
5968 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5970 for (i = 0; i < log_length; i++)
5972 for (j = 0; j < length; j += 2)
5974 first_vect = dr_chain[j];
5975 second_vect = dr_chain[j+1];
5977 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5978 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5979 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5980 first_vect, second_vect,
5981 perm_mask_even);
5982 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5983 (*result_chain)[j/2] = data_ref;
5985 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5986 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5987 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5988 first_vect, second_vect,
5989 perm_mask_odd);
5990 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5991 (*result_chain)[j/2+length/2] = data_ref;
5993 memcpy (dr_chain.address (), result_chain->address (),
5994 length * sizeof (tree));
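
/* Illustrative sketch only, not part of GCC: a scalar model of one
   extract_even/extract_odd stage of vect_permute_load_chain for the
   power-of-2 case.  The function name is hypothetical; the real transform
   emits VEC_PERM_EXPRs with perm_mask_even = {0, 2, 4, ...} and
   perm_mask_odd = {1, 3, 5, ...} applied to the concatenation of the two
   input vectors.  With first = {0..7} and second = {8..15} this reproduces
   E1 and E2 from the first-stage example in the comment above.  */

static void
model_extract_even_odd (const int *first, const int *second,
			unsigned int nelt, int *even, int *odd)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      /* Selector value 2 * i indexes the 2 * nelt element concatenation.  */
      even[i] = 2 * i < nelt ? first[2 * i] : second[2 * i - nelt];
      /* Selector value 2 * i + 1 likewise.  */
      odd[i] = (2 * i + 1 < nelt
		? first[2 * i + 1] : second[2 * i + 1 - nelt]);
    }
}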
5999 /* Function vect_shift_permute_load_chain.
6001 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6002 a sequence of stmts to reorder the input data accordingly.
6003 Return the final references for loads in RESULT_CHAIN.
6004 Return true if successful, false otherwise.
6006 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6007 The input is 3 vectors each containing 8 elements. We assign a
6008 number to each element, the input sequence is:
6010 1st vec: 0 1 2 3 4 5 6 7
6011 2nd vec: 8 9 10 11 12 13 14 15
6012 3rd vec: 16 17 18 19 20 21 22 23
6014 The output sequence should be:
6016 1st vec: 0 3 6 9 12 15 18 21
6017 2nd vec: 1 4 7 10 13 16 19 22
6018 3rd vec: 2 5 8 11 14 17 20 23
6020 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6022 First we shuffle all 3 vectors to get correct elements order:
6024 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6025 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6026 3rd vec: (16 19 22) (17 20 23) (18 21)
6028 Next we unite and shift vector 3 times:
6030 1st step:
6031 shift right by 6 the concatenation of:
6032 "1st vec" and "2nd vec"
6033 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6034 "2nd vec" and "3rd vec"
6035 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6036 "3rd vec" and "1st vec"
6037 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6038 | New vectors |
6040 So that now new vectors are:
6042 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6043 2nd vec: (10 13) (16 19 22) (17 20 23)
6044 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6046 2nd step:
6047 shift right by 5 the concatenation of:
6048 "1st vec" and "3rd vec"
6049 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6050 "2nd vec" and "1st vec"
6051 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6052 "3rd vec" and "2nd vec"
6053 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6054 | New vectors |
6056 So that now new vectors are:
6058 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6059 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6060 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6062 3rd step:
6063 shift right by 5 the concatenation of:
6064 "1st vec" and "1st vec"
6065 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6066 shift right by 3 the concatenation of:
6067 "2nd vec" and "2nd vec"
6068 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6069 | New vectors |
6071 So that now all vectors are READY:
6072 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6073 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6074 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6076 This algorithm is faster than one in vect_permute_load_chain if:
6077 1. "shift of a concatination" is faster than general permutation.
6078 This is usually so.
6079 2. The TARGET machine can't execute vector instructions in parallel.
6080 This is because each step of the algorithm depends on the previous one.
6081 The algorithm in vect_permute_load_chain is much more parallel.
6083 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6086 static bool
6087 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6088 unsigned int length,
6089 stmt_vec_info stmt_info,
6090 gimple_stmt_iterator *gsi,
6091 vec<tree> *result_chain)
6093 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6094 tree perm2_mask1, perm2_mask2, perm3_mask;
6095 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6096 gimple *perm_stmt;
6098 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6099 unsigned int i;
6100 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6102 unsigned HOST_WIDE_INT nelt, vf;
6103 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6104 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6105 /* Not supported for variable-length vectors. */
6106 return false;
6108 vec_perm_builder sel (nelt, nelt, 1);
6109 sel.quick_grow (nelt);
6111 result_chain->quick_grow (length);
6112 memcpy (result_chain->address (), dr_chain.address (),
6113 length * sizeof (tree));
6115 if (pow2p_hwi (length) && vf > 4)
6117 unsigned int j, log_length = exact_log2 (length);
6118 for (i = 0; i < nelt / 2; ++i)
6119 sel[i] = i * 2;
6120 for (i = 0; i < nelt / 2; ++i)
6121 sel[nelt / 2 + i] = i * 2 + 1;
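      /* For nelt == 8 this is {0 2 4 6 1 3 5 7}: the even elements of a
	 single vector followed by its odd elements.  */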
6122 vec_perm_indices indices (sel, 2, nelt);
6123 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6125 if (dump_enabled_p ())
6126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6127 "shuffle of 2 fields structure is not \
6128 supported by target\n");
6129 return false;
6131 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6133 for (i = 0; i < nelt / 2; ++i)
6134 sel[i] = i * 2 + 1;
6135 for (i = 0; i < nelt / 2; ++i)
6136 sel[nelt / 2 + i] = i * 2;
6137 indices.new_vector (sel, 2, nelt);
6138 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6140 if (dump_enabled_p ())
6141 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6142 "shuffle of 2 fields structure is not \
6143 supported by target\n");
6144 return false;
6146 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6148 /* Generating permutation constant to shift all elements.
6149 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6150 for (i = 0; i < nelt; i++)
6151 sel[i] = nelt / 2 + i;
6152 indices.new_vector (sel, 2, nelt);
6153 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6155 if (dump_enabled_p ())
6156 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6157 "shift permutation is not supported by target\n");
6158 return false;
6160 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6162 /* Generating permutation constant to select vector from 2.
6163 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6164 for (i = 0; i < nelt / 2; i++)
6165 sel[i] = i;
6166 for (i = nelt / 2; i < nelt; i++)
6167 sel[i] = nelt + i;
6168 indices.new_vector (sel, 2, nelt);
6169 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6171 if (dump_enabled_p ())
6172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6173 "select is not supported by target\n");
6174 return false;
6176 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6178 for (i = 0; i < log_length; i++)
6180 for (j = 0; j < length; j += 2)
6182 first_vect = dr_chain[j];
6183 second_vect = dr_chain[j + 1];
6185 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6186 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6187 first_vect, first_vect,
6188 perm2_mask1);
6189 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6190 vect[0] = data_ref;
6192 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6193 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6194 second_vect, second_vect,
6195 perm2_mask2);
6196 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6197 vect[1] = data_ref;
6199 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6200 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6201 vect[0], vect[1], shift1_mask);
6202 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6203 (*result_chain)[j/2 + length/2] = data_ref;
6205 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6206 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6207 vect[0], vect[1], select_mask);
6208 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6209 (*result_chain)[j/2] = data_ref;
6211 memcpy (dr_chain.address (), result_chain->address (),
6212 length * sizeof (tree));
6214 return true;
6216 if (length == 3 && vf > 2)
6218 unsigned int k = 0, l = 0;
6220 /* Generating permutation constant to get all elements in the right order.
6221 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6222 for (i = 0; i < nelt; i++)
6224 if (3 * k + (l % 3) >= nelt)
6226 k = 0;
6227 l += (3 - (nelt % 3));
6229 sel[i] = 3 * k + (l % 3);
6230 k++;
6232 vec_perm_indices indices (sel, 2, nelt);
6233 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6235 if (dump_enabled_p ())
6236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6237 "shuffle of 3 fields structure is not \
6238 supported by target\n");
6239 return false;
6241 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6243 /* Generating permutation constant to shift all elements.
6244 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6245 for (i = 0; i < nelt; i++)
6246 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6247 indices.new_vector (sel, 2, nelt);
6248 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6250 if (dump_enabled_p ())
6251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6252 "shift permutation is not supported by target\n");
6253 return false;
6255 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6257 /* Generating permutation constant to shift all elements.
6258 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6259 for (i = 0; i < nelt; i++)
6260 sel[i] = 2 * (nelt / 3) + 1 + i;
6261 indices.new_vector (sel, 2, nelt);
6262 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6264 if (dump_enabled_p ())
6265 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6266 "shift permutation is not supported by target\n");
6267 return false;
6269 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6271 /* Generating permutation constant to shift all elements.
6272 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6273 for (i = 0; i < nelt; i++)
6274 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6275 indices.new_vector (sel, 2, nelt);
6276 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6278 if (dump_enabled_p ())
6279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6280 "shift permutation is not supported by target\n");
6281 return false;
6283 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6285 /* Generating permutation constant to shift all elements.
6286 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6287 for (i = 0; i < nelt; i++)
6288 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6289 indices.new_vector (sel, 2, nelt);
6290 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6292 if (dump_enabled_p ())
6293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6294 "shift permutation is not supported by target\n");
6295 return false;
6297 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6299 for (k = 0; k < 3; k++)
6301 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6302 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6303 dr_chain[k], dr_chain[k],
6304 perm3_mask);
6305 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6306 vect[k] = data_ref;
6309 for (k = 0; k < 3; k++)
6311 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6312 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6313 vect[k % 3], vect[(k + 1) % 3],
6314 shift1_mask);
6315 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6316 vect_shift[k] = data_ref;
6319 for (k = 0; k < 3; k++)
6321 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6322 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6323 vect_shift[(4 - k) % 3],
6324 vect_shift[(3 - k) % 3],
6325 shift2_mask);
6326 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6327 vect[k] = data_ref;
6330 (*result_chain)[3 - (nelt % 3)] = vect[2];
6332 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6333 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6334 vect[0], shift3_mask);
6335 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6336 (*result_chain)[nelt % 3] = data_ref;
6338 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6339 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6340 vect[1], shift4_mask);
6341 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6342 (*result_chain)[0] = data_ref;
6343 return true;
6345 return false;
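
/* Illustrative sketch only, not part of GCC: the "shift of a concatenation"
   primitive used by vect_shift_permute_load_chain, modelled on scalars.
   The function name is hypothetical; each shiftN_mask above selects indices
   {k, k + 1, ..., k + nelt - 1} from the 2 * nelt element concatenation of
   the two operands of a VEC_PERM_EXPR, which is exactly what this loop
   computes.  */

static void
model_shift_concat (const int *v1, const int *v2, unsigned int nelt,
		    unsigned int k, int *out)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      unsigned int idx = k + i;
      /* Elements 0 .. nelt-1 of the concatenation come from V1,
	 elements nelt .. 2*nelt-1 from V2.  */
      out[i] = idx < nelt ? v1[idx] : v2[idx - nelt];
    }
}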
6348 /* Function vect_transform_grouped_load.
6350 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6351 to perform their permutation and assign the resulting vectorized statements to
6352 the scalar statements.
6355 void
6356 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6357 vec<tree> dr_chain,
6358 int size, gimple_stmt_iterator *gsi)
6360 machine_mode mode;
6361 vec<tree> result_chain = vNULL;
6363 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6364 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6365 vectors, that are ready for vector computation. */
6366 result_chain.create (size);
6368 /* If the reassociation width for the vector type is 2 or greater, the target
6369 machine can execute 2 or more vector instructions in parallel. Otherwise
6370 try to get the chain for the load group using vect_shift_permute_load_chain. */
6371 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6372 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6373 || pow2p_hwi (size)
6374 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6375 gsi, &result_chain))
6376 vect_permute_load_chain (vinfo, dr_chain,
6377 size, stmt_info, gsi, &result_chain);
6378 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6379 result_chain.release ();
6382 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6383 generated as part of the vectorization of STMT_INFO. Assign the statement
6384 for each vector to the associated scalar statement. */
6386 void
6387 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6388 vec<tree> result_chain)
6390 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6391 unsigned int i, gap_count;
6392 tree tmp_data_ref;
6394 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6395 Since we scan the chain starting from its first node, their order
6396 corresponds to the order of data-refs in RESULT_CHAIN. */
6397 stmt_vec_info next_stmt_info = first_stmt_info;
6398 gap_count = 1;
6399 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6401 if (!next_stmt_info)
6402 break;
6404 /* Skip the gaps. Loads created for the gaps will be removed by the dead
6405 code elimination pass later. No need to check for the first stmt in
6406 the group, since it always exists.
6407 DR_GROUP_GAP is the number of steps in elements from the previous
6408 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6409 correspond to the gaps. */
6410 if (next_stmt_info != first_stmt_info
6411 && gap_count < DR_GROUP_GAP (next_stmt_info))
6413 gap_count++;
6414 continue;
6417 /* ??? The following needs cleanup after the removal of
6418 DR_GROUP_SAME_DR_STMT. */
6419 if (next_stmt_info)
6421 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6422 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6423 copies, and we put the new vector statement last. */
6424 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6426 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6427 gap_count = 1;
6432 /* Function vect_can_force_dr_alignment_p.
6434 Returns whether the alignment of a DECL can be forced to be aligned
6435 on an ALIGNMENT-bit boundary. */
6437 bool
6438 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6440 if (!VAR_P (decl))
6441 return false;
6443 if (decl_in_symtab_p (decl)
6444 && !symtab_node::get (decl)->can_increase_alignment_p ())
6445 return false;
6447 if (TREE_STATIC (decl))
6448 return (known_le (alignment,
6449 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6450 else
6451 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6455 /* Return whether the data reference DR_INFO is supported with respect to its
6456 alignment.
6457 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6458 if it is aligned, i.e., check if it is possible to vectorize it with different
6459 alignment. */
6461 enum dr_alignment_support
6462 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6463 bool check_aligned_accesses)
6465 data_reference *dr = dr_info->dr;
6466 stmt_vec_info stmt_info = dr_info->stmt;
6467 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6468 machine_mode mode = TYPE_MODE (vectype);
6469 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6470 class loop *vect_loop = NULL;
6471 bool nested_in_vect_loop = false;
6473 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6474 return dr_aligned;
6476 /* For now assume all conditional loads/stores support unaligned
6477 access without any special code. */
6478 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6479 if (gimple_call_internal_p (stmt)
6480 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6481 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6482 return dr_unaligned_supported;
6484 if (loop_vinfo)
6486 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6487 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6490 /* Possibly unaligned access. */
6492 /* We can choose between using the implicit realignment scheme (generating
6493 a misaligned_move stmt) and the explicit realignment scheme (generating
6494 aligned loads with a REALIGN_LOAD). There are two variants to the
6495 explicit realignment scheme: optimized, and unoptimized.
6496 We can optimize the realignment only if the step between consecutive
6497 vector loads is equal to the vector size. Since the vector memory
6498 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6499 is guaranteed that the misalignment amount remains the same throughout the
6500 execution of the vectorized loop. Therefore, we can create the
6501 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6502 at the loop preheader.
6504 However, in the case of outer-loop vectorization, when vectorizing a
6505 memory access in the inner-loop nested within the LOOP that is now being
6506 vectorized, while it is guaranteed that the misalignment of the
6507 vectorized memory access will remain the same in different outer-loop
6508 iterations, it is *not* guaranteed that it will remain the same throughout
6509 the execution of the inner-loop. This is because the inner-loop advances
6510 with the original scalar step (and not in steps of VS). If the inner-loop
6511 step happens to be a multiple of VS, then the misalignment remains fixed
6512 and we can use the optimized realignment scheme. For example:
6514 for (i=0; i<N; i++)
6515 for (j=0; j<M; j++)
6516 s += a[i+j];
6518 When vectorizing the i-loop in the above example, the step between
6519 consecutive vector loads is 1, and so the misalignment does not remain
6520 fixed across the execution of the inner-loop, and the realignment cannot
6521 be optimized (as illustrated in the following pseudo vectorized loop):
6523 for (i=0; i<N; i+=4)
6524 for (j=0; j<M; j++){
6525 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6526 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6527 // (assuming that we start from an aligned address).
6530 We therefore have to use the unoptimized realignment scheme:
6532 for (i=0; i<N; i+=4)
6533 for (j=k; j<M; j+=4)
6534 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6535 // that the misalignment of the initial address is
6536 // 0).
6538 The loop can then be vectorized as follows:
6540 for (k=0; k<4; k++){
6541 rt = get_realignment_token (&vp[k]);
6542 for (i=0; i<N; i+=4){
6543 v1 = vp[i+k];
6544 for (j=k; j<M; j+=4){
6545 v2 = vp[i+j+VS-1];
6546 va = REALIGN_LOAD <v1,v2,rt>;
6547 vs += va;
6548 v1 = v2;
6551 } */
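  /* Only as an illustration (the token is really a target-specific,
     typically byte-level, permutation mask): at element granularity the
     REALIGN_LOAD above behaves like selecting a window of VS elements from
     the concatenation of the two aligned loads, starting at the
     misalignment offset k encoded by the realignment token:

       for (i=0; i<VS; i++)
	 va[i] = i + k < VS ? v1[i + k] : v2[i + k - VS];  */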
6553 if (DR_IS_READ (dr))
6555 bool is_packed = false;
6556 tree type = (TREE_TYPE (DR_REF (dr)));
6558 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6559 && (!targetm.vectorize.builtin_mask_for_load
6560 || targetm.vectorize.builtin_mask_for_load ()))
6562 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6564 /* If we are doing SLP then the accesses need not have the
6565 same alignment, instead it depends on the SLP group size. */
6566 if (loop_vinfo
6567 && STMT_SLP_TYPE (stmt_info)
6568 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6569 * (DR_GROUP_SIZE
6570 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6571 TYPE_VECTOR_SUBPARTS (vectype)))
6573 else if (!loop_vinfo
6574 || (nested_in_vect_loop
6575 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6576 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6577 return dr_explicit_realign;
6578 else
6579 return dr_explicit_realign_optimized;
6581 if (!known_alignment_for_access_p (dr_info))
6582 is_packed = not_size_aligned (DR_REF (dr));
6584 if (targetm.vectorize.support_vector_misalignment
6585 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6586 /* Can't software pipeline the loads, but can at least do them. */
6587 return dr_unaligned_supported;
6589 else
6591 bool is_packed = false;
6592 tree type = (TREE_TYPE (DR_REF (dr)));
6594 if (!known_alignment_for_access_p (dr_info))
6595 is_packed = not_size_aligned (DR_REF (dr));
6597 if (targetm.vectorize.support_vector_misalignment
6598 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6599 return dr_unaligned_supported;
6602 /* Unsupported. */
6603 return dr_unaligned_unsupported;