gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
120 HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt_info->stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
134 if (assign
135 && (gimple_assign_cast_p (assign)
136 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
140 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
142 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
144 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
145 if (rhs < lhs)
146 scalar_type = rhs_type;
149 *lhs_size_unit = lhs;
150 *rhs_size_unit = rhs;
151 return scalar_type;
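/* A hypothetical example of why the rhs must be inspected: in a loop body like

     out[i] = (int) in8[i] + c;

   where in8 has 1-byte elements, the smallest scalar type returned for the
   conversion statement is the 8-bit element type, so with a 16-byte vector
   size the vectorization factor becomes 16 rather than the 4 implied by the
   32-bit result type.  */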
155 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
156 tested at run-time. Return TRUE if DDR was successfully inserted.
157 Return false if versioning is not supported. */
159 static opt_result
160 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
162 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
164 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
165 return opt_result::failure_at (vect_location,
166 "will not create alias checks, as"
167 " --param vect-max-version-for-alias-checks"
168 " == 0\n");
170 opt_result res
171 = runtime_alias_check_p (ddr, loop,
172 optimize_loop_nest_for_speed_p (loop));
173 if (!res)
174 return res;
176 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
177 return opt_result::success ();
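/* For reference, the parameter tested above corresponds to the command-line
   knob --param vect-max-version-for-alias-checks.  A hypothetical invocation

     gcc -O3 --param vect-max-version-for-alias-checks=0 foo.c

   therefore disables runtime alias versioning and makes this function fail
   for every DDR.  */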
180 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
182 static void
183 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
185 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
186 for (unsigned int i = 0; i < checks.length(); ++i)
187 if (checks[i] == value)
188 return;
190 if (dump_enabled_p ())
191 dump_printf_loc (MSG_NOTE, vect_location,
192 "need run-time check that %T is nonzero\n",
193 value);
194 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
197 /* Return true if we know that the order of vectorized DR_INFO_A and
198 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
199 DR_INFO_B. At least one of the accesses is a write. */
201 static bool
202 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
204 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
205 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
207 /* Single statements are always kept in their original order. */
208 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
209 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
210 return true;
212 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
213 group are emitted at the position of the first scalar load and all
214 stores in a group are emitted at the position of the last scalar store.
215 Thus writes will happen no earlier than their current position
216 (but could happen later) while reads will happen no later than their
217 current position (but could happen earlier). Reordering is therefore
218 only possible if the first access is a write. */
219 stmtinfo_a = vect_orig_stmt (stmtinfo_a);
220 stmtinfo_b = vect_orig_stmt (stmtinfo_b);
221 stmt_vec_info earlier_stmt_info = get_earlier_stmt (stmtinfo_a, stmtinfo_b);
222 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (earlier_stmt_info));
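/* Illustration (hypothetical accesses): given a grouped store followed by a
   grouped load,

     a[i] = x;
     y = a[i+1];

   the store is emitted at the position of the last scalar store of its group
   and may therefore sink below the load, so the scalar order is not treated
   as preserved; if the earlier access is a read, both allowed motions keep
   the read before the write.  */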
225 /* A subroutine of vect_analyze_data_ref_dependence. Handle
226 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
227 distances. These distances are conservatively correct but they don't
228 reflect a guaranteed dependence.
230 Return true if this function does all the work necessary to avoid
231 an alias or false if the caller should use the dependence distances
232 to limit the vectorization factor in the usual way. LOOP_DEPTH is
233 the depth of the loop described by LOOP_VINFO and the other arguments
234 are as for vect_analyze_data_ref_dependence. */
236 static bool
237 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
238 loop_vec_info loop_vinfo,
239 int loop_depth, unsigned int *max_vf)
241 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
242 lambda_vector dist_v;
243 unsigned int i;
244 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
246 int dist = dist_v[loop_depth];
247 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
249 /* If the user asserted safelen >= DIST consecutive iterations
250 can be executed concurrently, assume independence.
252 ??? An alternative would be to add the alias check even
253 in this case, and vectorize the fallback loop with the
254 maximum VF set to safelen. However, if the user has
255 explicitly given a length, it's less likely that that
256 would be a win. */
257 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
259 if ((unsigned int) loop->safelen < *max_vf)
260 *max_vf = loop->safelen;
261 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
262 continue;
265 /* For dependence distances of 2 or more, we have the option
266 of limiting VF or checking for an alias at runtime.
267 Prefer to check at runtime if we can, to avoid limiting
268 the VF unnecessarily when the bases are in fact independent.
270 Note that the alias checks will be removed if the VF ends up
271 being small enough. */
272 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
273 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
274 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
275 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
276 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
279 return true;
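/* A sketch of how the safelen path above is normally triggered: annotating
   the loop with

     #pragma omp simd safelen(8)

   sets loop->safelen to 8, so any dependence distance of up to 8 is assumed
   not to matter here and *max_vf is simply capped at 8 instead of emitting a
   runtime alias check.  */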
283 /* Function vect_analyze_data_ref_dependence.
285 FIXME: I needed to change the sense of the returned flag.
287 Return FALSE if there (might) exist a dependence between a memory-reference
288 DRA and a memory-reference DRB. When versioning for alias may check a
289 dependence at run-time, return TRUE. Adjust *MAX_VF according to
290 the data dependence. */
292 static opt_result
293 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
294 loop_vec_info loop_vinfo,
295 unsigned int *max_vf)
297 unsigned int i;
298 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
299 struct data_reference *dra = DDR_A (ddr);
300 struct data_reference *drb = DDR_B (ddr);
301 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
302 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
303 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
304 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
305 lambda_vector dist_v;
306 unsigned int loop_depth;
308 /* In loop analysis all data references should be vectorizable. */
309 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
310 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
311 gcc_unreachable ();
313 /* Independent data accesses. */
314 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
315 return opt_result::success ();
317 if (dra == drb
318 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
319 return opt_result::success ();
321 /* We do not have to consider dependences between accesses that belong
322 to the same group, unless the stride could be smaller than the
323 group size. */
324 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
325 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
326 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
327 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
328 return opt_result::success ();
330 /* Even if we have an anti-dependence then, as the vectorized loop covers at
331 least two scalar iterations, there is always also a true dependence.
332 As the vectorizer does not re-order loads and stores we can ignore
333 the anti-dependence if TBAA can disambiguate both DRs similar to the
334 case with known negative distance anti-dependences (positive
335 distance anti-dependences would violate TBAA constraints). */
336 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
337 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
338 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
339 get_alias_set (DR_REF (drb))))
340 return opt_result::success ();
342 /* Unknown data dependence. */
343 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
345 /* If user asserted safelen consecutive iterations can be
346 executed concurrently, assume independence. */
347 if (loop->safelen >= 2)
349 if ((unsigned int) loop->safelen < *max_vf)
350 *max_vf = loop->safelen;
351 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
352 return opt_result::success ();
355 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
356 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
357 return opt_result::failure_at
358 (stmtinfo_a->stmt,
359 "versioning for alias not supported for: "
360 "can't determine dependence between %T and %T\n",
361 DR_REF (dra), DR_REF (drb));
363 if (dump_enabled_p ())
364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
365 "versioning for alias required: "
366 "can't determine dependence between %T and %T\n",
367 DR_REF (dra), DR_REF (drb));
369 /* Add to list of ddrs that need to be tested at run-time. */
370 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
373 /* Known data dependence. */
374 if (DDR_NUM_DIST_VECTS (ddr) == 0)
376 /* If user asserted safelen consecutive iterations can be
377 executed concurrently, assume independence. */
378 if (loop->safelen >= 2)
380 if ((unsigned int) loop->safelen < *max_vf)
381 *max_vf = loop->safelen;
382 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
383 return opt_result::success ();
386 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
387 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
388 return opt_result::failure_at
389 (stmtinfo_a->stmt,
390 "versioning for alias not supported for: "
391 "bad dist vector for %T and %T\n",
392 DR_REF (dra), DR_REF (drb));
394 if (dump_enabled_p ())
395 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
396 "versioning for alias required: "
397 "bad dist vector for %T and %T\n",
398 DR_REF (dra), DR_REF (drb));
399 /* Add to list of ddrs that need to be tested at run-time. */
400 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
403 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
405 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
406 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
407 loop_depth, max_vf))
408 return opt_result::success ();
410 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
412 int dist = dist_v[loop_depth];
414 if (dump_enabled_p ())
415 dump_printf_loc (MSG_NOTE, vect_location,
416 "dependence distance = %d.\n", dist);
418 if (dist == 0)
420 if (dump_enabled_p ())
421 dump_printf_loc (MSG_NOTE, vect_location,
422 "dependence distance == 0 between %T and %T\n",
423 DR_REF (dra), DR_REF (drb));
425 /* When we perform grouped accesses and perform implicit CSE
426 by detecting equal accesses and doing disambiguation with
427 runtime alias tests like for
428 .. = a[i];
429 .. = a[i+1];
430 a[i] = ..;
431 a[i+1] = ..;
432 *p = ..;
433 .. = a[i];
434 .. = a[i+1];
435 where we will end up loading { a[i], a[i+1] } once, make
436 sure that inserting group loads before the first load and
437 stores after the last store will do the right thing.
438 Similar for groups like
439 a[i] = ...;
440 ... = a[i];
441 a[i+1] = ...;
442 where loads from the group interleave with the store. */
443 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
444 return opt_result::failure_at (stmtinfo_a->stmt,
445 "READ_WRITE dependence"
446 " in interleaving.\n");
448 if (loop->safelen < 2)
450 tree indicator = dr_zero_step_indicator (dra);
451 if (!indicator || integer_zerop (indicator))
452 return opt_result::failure_at (stmtinfo_a->stmt,
453 "access also has a zero step\n");
454 else if (TREE_CODE (indicator) != INTEGER_CST)
455 vect_check_nonzero_value (loop_vinfo, indicator);
457 continue;
460 if (dist > 0 && DDR_REVERSED_P (ddr))
462 /* If DDR_REVERSED_P the order of the data-refs in DDR was
463 reversed (to make distance vector positive), and the actual
464 distance is negative. */
465 if (dump_enabled_p ())
466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
467 "dependence distance negative.\n");
468 /* Record a negative dependence distance to later limit the
469 amount of stmt copying / unrolling we can perform.
470 Only need to handle read-after-write dependence. */
471 if (DR_IS_READ (drb)
472 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
473 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
474 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
475 continue;
478 unsigned int abs_dist = abs (dist);
479 if (abs_dist >= 2 && abs_dist < *max_vf)
481 /* The dependence distance requires reduction of the maximal
482 vectorization factor. */
483 *max_vf = abs (dist);
484 if (dump_enabled_p ())
485 dump_printf_loc (MSG_NOTE, vect_location,
486 "adjusting maximal vectorization factor to %i\n",
487 *max_vf);
490 if (abs_dist >= *max_vf)
492 /* Dependence distance does not create dependence, as far as
493 vectorization is concerned, in this case. */
494 if (dump_enabled_p ())
495 dump_printf_loc (MSG_NOTE, vect_location,
496 "dependence distance >= VF.\n");
497 continue;
500 return opt_result::failure_at (stmtinfo_a->stmt,
501 "not vectorized, possible dependence "
502 "between data-refs %T and %T\n",
503 DR_REF (dra), DR_REF (drb));
506 return opt_result::success ();
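/* Worked example (hypothetical loop): a read-after-write recurrence such as

     for (i = 3; i < n; i++)
       a[i] = a[i-3] + b[i];

   has a dependence distance of 3, so the code above lowers *max_vf to 3;
   only when the distance is at least the final VF does the dependence become
   irrelevant for vectorization.  */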
509 /* Function vect_analyze_data_ref_dependences.
511 Examine all the data references in the loop, and make sure there do not
512 exist any data dependences between them. Set *MAX_VF according to
513 the maximum vectorization factor the data dependences allow. */
515 opt_result
516 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
517 unsigned int *max_vf)
519 unsigned int i;
520 struct data_dependence_relation *ddr;
522 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
524 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
526 LOOP_VINFO_DDRS (loop_vinfo)
527 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
528 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
529 /* We need read-read dependences to compute
530 STMT_VINFO_SAME_ALIGN_REFS. */
531 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
532 &LOOP_VINFO_DDRS (loop_vinfo),
533 LOOP_VINFO_LOOP_NEST (loop_vinfo),
534 true);
535 gcc_assert (res);
538 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
540 /* For epilogues we either have no aliases or alias versioning
541 was applied to original loop. Therefore we may just get max_vf
542 using VF of original loop. */
543 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
544 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
545 else
546 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
548 opt_result res
549 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
550 if (!res)
551 return res;
554 return opt_result::success ();
558 /* Function vect_slp_analyze_data_ref_dependence.
560 Return TRUE if there (might) exist a dependence between a memory-reference
561 DRA and a memory-reference DRB for VINFO. When versioning for alias
562 may check a dependence at run-time, return FALSE. */
565 static bool
566 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
567 struct data_dependence_relation *ddr)
569 struct data_reference *dra = DDR_A (ddr);
570 struct data_reference *drb = DDR_B (ddr);
571 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
572 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
574 /* We need to check dependences of statements marked as unvectorizable
575 as well, they still can prohibit vectorization. */
577 /* Independent data accesses. */
578 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
579 return false;
581 if (dra == drb)
582 return false;
584 /* Read-read is OK. */
585 if (DR_IS_READ (dra) && DR_IS_READ (drb))
586 return false;
588 /* If dra and drb are part of the same interleaving chain consider
589 them independent. */
590 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
591 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
592 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
593 return false;
595 /* Unknown data dependence. */
596 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
598 if (dump_enabled_p ())
599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
600 "can't determine dependence between %T and %T\n",
601 DR_REF (dra), DR_REF (drb));
603 else if (dump_enabled_p ())
604 dump_printf_loc (MSG_NOTE, vect_location,
605 "determined dependence between %T and %T\n",
606 DR_REF (dra), DR_REF (drb));
608 return true;
612 /* Analyze dependences involved in the transform of SLP NODE. STORES
613 contain the vector of scalar stores of this instance if we are
614 disambiguating the loads. */
616 static bool
617 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
618 vec<stmt_vec_info> stores,
619 stmt_vec_info last_store_info)
621 /* This walks over all stmts involved in the SLP load/store done
622 in NODE verifying we can sink them up to the last stmt in the
623 group. */
624 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
625 vec_info *vinfo = last_access_info->vinfo;
626 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
628 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
629 if (access_info == last_access_info)
630 continue;
631 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
632 ao_ref ref;
633 bool ref_initialized_p = false;
634 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
635 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
637 gimple *stmt = gsi_stmt (gsi);
638 if (! gimple_vuse (stmt)
639 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
640 continue;
642 /* If we couldn't record a (single) data reference for this
643 stmt we have to resort to the alias oracle. */
644 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
645 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
646 if (!dr_b)
648 /* We are moving a store or sinking a load - this means
649 we cannot use TBAA for disambiguation. */
650 if (!ref_initialized_p)
651 ao_ref_init (&ref, DR_REF (dr_a));
652 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
653 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
654 return false;
655 continue;
658 bool dependent = false;
659 /* If we run into a store of this same instance (we've just
660 marked those) then delay dependence checking until we run
661 into the last store because this is where it will have
662 been sunk to (and we verify if we can do that as well). */
663 if (gimple_visited_p (stmt))
665 if (stmt_info != last_store_info)
666 continue;
667 unsigned i;
668 stmt_vec_info store_info;
669 FOR_EACH_VEC_ELT (stores, i, store_info)
671 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
672 ddr_p ddr = initialize_data_dependence_relation
673 (dr_a, store_dr, vNULL);
674 dependent
675 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
676 free_dependence_relation (ddr);
677 if (dependent)
678 break;
681 else
683 ddr_p ddr = initialize_data_dependence_relation (dr_a,
684 dr_b, vNULL);
685 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
686 free_dependence_relation (ddr);
688 if (dependent)
689 return false;
692 return true;
696 /* Function vect_slp_analyze_instance_dependence.
698 Examine all the data references in the SLP instance, and make sure there
699 do not exist any data dependences between them that would prevent the
700 instance from being vectorized. */
702 bool
703 vect_slp_analyze_instance_dependence (slp_instance instance)
705 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
707 /* The stores of this instance are at the root of the SLP tree. */
708 slp_tree store = SLP_INSTANCE_TREE (instance);
709 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
710 store = NULL;
712 /* Verify we can sink stores to the vectorized stmt insert location. */
713 stmt_vec_info last_store_info = NULL;
714 if (store)
716 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
717 return false;
719 /* Mark stores in this instance and remember the last one. */
720 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
721 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
722 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
725 bool res = true;
727 /* Verify we can sink loads to the vectorized stmt insert location,
728 special-casing stores of this instance. */
729 slp_tree load;
730 unsigned int i;
731 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
732 if (! vect_slp_analyze_node_dependences (instance, load,
733 store
734 ? SLP_TREE_SCALAR_STMTS (store)
735 : vNULL, last_store_info))
737 res = false;
738 break;
741 /* Unset the visited flag. */
742 if (store)
743 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
744 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
746 return res;
749 /* Record the base alignment guarantee given by DRB, which occurs
750 in STMT_INFO. */
752 static void
753 vect_record_base_alignment (stmt_vec_info stmt_info,
754 innermost_loop_behavior *drb)
756 vec_info *vinfo = stmt_info->vinfo;
757 bool existed;
758 innermost_loop_behavior *&entry
759 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
760 if (!existed || entry->base_alignment < drb->base_alignment)
762 entry = drb;
763 if (dump_enabled_p ())
764 dump_printf_loc (MSG_NOTE, vect_location,
765 "recording new base alignment for %T\n"
766 " alignment: %d\n"
767 " misalignment: %d\n"
768 " based on: %G",
769 drb->base_address,
770 drb->base_alignment,
771 drb->base_misalignment,
772 stmt_info->stmt);
776 /* If the region we're going to vectorize is reached, all unconditional
777 data references occur at least once. We can therefore pool the base
778 alignment guarantees from each unconditional reference. Do this by
779 going through all the data references in VINFO and checking whether
780 the containing statement makes the reference unconditionally. If so,
781 record the alignment of the base address in VINFO so that it can be
782 used for all other references with the same base. */
784 void
785 vect_record_base_alignments (vec_info *vinfo)
787 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
788 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
789 data_reference *dr;
790 unsigned int i;
791 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
793 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
794 stmt_vec_info stmt_info = dr_info->stmt;
795 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
796 && STMT_VINFO_VECTORIZABLE (stmt_info)
797 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
799 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
801 /* If DR is nested in the loop that is being vectorized, we can also
802 record the alignment of the base wrt the outer loop. */
803 if (loop && nested_in_vect_loop_p (loop, stmt_info))
804 vect_record_base_alignment
805 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
810 /* Return the target alignment for the vectorized form of DR_INFO. */
812 static unsigned int
813 vect_calculate_target_alignment (dr_vec_info *dr_info)
815 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
816 return targetm.vectorize.preferred_vector_alignment (vectype);
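/* Note: the default implementation of this hook simply returns
   TYPE_ALIGN (vectype); individual targets may override it, e.g. to ask for
   a full vector-width alignment even when the vector type itself is given a
   smaller alignment.  */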
819 /* Function vect_compute_data_ref_alignment
821 Compute the misalignment of the data reference DR_INFO.
823 Output:
824 1. DR_MISALIGNMENT (DR_INFO) is defined.
826 FOR NOW: No analysis is actually performed. Misalignment is calculated
827 only for trivial cases. TODO. */
829 static void
830 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
832 stmt_vec_info stmt_info = dr_info->stmt;
833 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
834 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
835 struct loop *loop = NULL;
836 tree ref = DR_REF (dr_info->dr);
837 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
839 if (dump_enabled_p ())
840 dump_printf_loc (MSG_NOTE, vect_location,
841 "vect_compute_data_ref_alignment:\n");
843 if (loop_vinfo)
844 loop = LOOP_VINFO_LOOP (loop_vinfo);
846 /* Initialize misalignment to unknown. */
847 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
849 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
850 return;
852 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
853 bool step_preserves_misalignment_p;
855 unsigned HOST_WIDE_INT vector_alignment
856 = vect_calculate_target_alignment (dr_info) / BITS_PER_UNIT;
857 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
859 /* No step for BB vectorization. */
860 if (!loop)
862 gcc_assert (integer_zerop (drb->step));
863 step_preserves_misalignment_p = true;
866 /* In case the dataref is in an inner-loop of the loop that is being
867 vectorized (LOOP), we use the base and misalignment information
868 relative to the outer-loop (LOOP). This is ok only if the misalignment
869 stays the same throughout the execution of the inner-loop, which is why
870 we have to check that the stride of the dataref in the inner-loop is a
871 multiple of the vector alignment. */
872 else if (nested_in_vect_loop_p (loop, stmt_info))
874 step_preserves_misalignment_p
875 = (DR_STEP_ALIGNMENT (dr_info->dr) % vector_alignment) == 0;
877 if (dump_enabled_p ())
879 if (step_preserves_misalignment_p)
880 dump_printf_loc (MSG_NOTE, vect_location,
881 "inner step divides the vector alignment.\n");
882 else
883 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
884 "inner step doesn't divide the vector"
885 " alignment.\n");
889 /* Similarly we can only use base and misalignment information relative to
890 an innermost loop if the misalignment stays the same throughout the
891 execution of the loop. As above, this is the case if the stride of
892 the dataref, multiplied by the vectorization factor, is a multiple of the alignment. */
893 else
895 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
896 step_preserves_misalignment_p
897 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vector_alignment);
899 if (!step_preserves_misalignment_p && dump_enabled_p ())
900 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
901 "step doesn't divide the vector alignment.\n");
904 unsigned int base_alignment = drb->base_alignment;
905 unsigned int base_misalignment = drb->base_misalignment;
907 /* Calculate the maximum of the pooled base address alignment and the
908 alignment that we can compute for DR itself. */
909 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
910 if (entry && base_alignment < (*entry)->base_alignment)
912 base_alignment = (*entry)->base_alignment;
913 base_misalignment = (*entry)->base_misalignment;
916 if (drb->offset_alignment < vector_alignment
917 || !step_preserves_misalignment_p
918 /* We need to know whether the step wrt the vectorized loop is
919 negative when computing the starting misalignment below. */
920 || TREE_CODE (drb->step) != INTEGER_CST)
922 if (dump_enabled_p ())
923 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
924 "Unknown alignment for access: %T\n", ref);
925 return;
928 if (base_alignment < vector_alignment)
930 unsigned int max_alignment;
931 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
932 if (max_alignment < vector_alignment
933 || !vect_can_force_dr_alignment_p (base,
934 vector_alignment * BITS_PER_UNIT))
936 if (dump_enabled_p ())
937 dump_printf_loc (MSG_NOTE, vect_location,
938 "can't force alignment of ref: %T\n", ref);
939 return;
942 /* Force the alignment of the decl.
943 NOTE: This is the only change to the code we make during
944 the analysis phase, before deciding to vectorize the loop. */
945 if (dump_enabled_p ())
946 dump_printf_loc (MSG_NOTE, vect_location,
947 "force alignment of %T\n", ref);
949 dr_info->base_decl = base;
950 dr_info->base_misaligned = true;
951 base_misalignment = 0;
953 poly_int64 misalignment
954 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
956 /* If this is a backward running DR then first access in the larger
957 vectype actually is N-1 elements before the address in the DR.
958 Adjust misalign accordingly. */
959 if (tree_int_cst_sgn (drb->step) < 0)
960 /* PLUS because STEP is negative. */
961 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
962 * TREE_INT_CST_LOW (drb->step));
964 unsigned int const_misalignment;
965 if (!known_misalignment (misalignment, vector_alignment,
966 &const_misalignment))
968 if (dump_enabled_p ())
969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
970 "Non-constant misalignment for access: %T\n", ref);
971 return;
974 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
976 if (dump_enabled_p ())
977 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
978 "misalign = %d bytes of ref %T\n",
979 DR_MISALIGNMENT (dr_info), ref);
981 return;
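/* Worked example with made-up numbers: with a 16-byte target alignment, a
   base address whose known misalignment is 0 and a DR_INIT of 20 bytes, the
   code above computes misalignment = 0 + 20 and records
   DR_MISALIGNMENT = 20 % 16 = 4.  */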
984 /* Function vect_update_misalignment_for_peel.
985 Sets DR_INFO's misalignment
986 - to 0 if it has the same alignment as DR_PEEL_INFO,
987 - to the misalignment computed using NPEEL if DR_INFO's alignment is known,
988 - to -1 (unknown) otherwise.
990 DR_INFO - the data reference whose misalignment is to be adjusted.
991 DR_PEEL_INFO - the data reference whose misalignment is being made
992 zero in the vector loop by the peel.
993 NPEEL - the number of iterations in the peel loop if the misalignment
994 of DR_PEEL_INFO is known at compile time. */
996 static void
997 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
998 dr_vec_info *dr_peel_info, int npeel)
1000 unsigned int i;
1001 vec<dr_p> same_aligned_drs;
1002 struct data_reference *current_dr;
1003 int dr_size = vect_get_scalar_dr_size (dr_info);
1004 int dr_peel_size = vect_get_scalar_dr_size (dr_peel_info);
1005 stmt_vec_info stmt_info = dr_info->stmt;
1006 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1008 /* For interleaved data accesses the step in the loop must be multiplied by
1009 the size of the interleaving group. */
1010 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1011 dr_size *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
1012 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1013 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1015 /* It can be assumed that the data refs with the same alignment as dr_peel
1016 are aligned in the vector loop. */
1017 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1018 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1020 if (current_dr != dr_info->dr)
1021 continue;
1022 gcc_assert (!known_alignment_for_access_p (dr_info)
1023 || !known_alignment_for_access_p (dr_peel_info)
1024 || (DR_MISALIGNMENT (dr_info) / dr_size
1025 == DR_MISALIGNMENT (dr_peel_info) / dr_peel_size));
1026 SET_DR_MISALIGNMENT (dr_info, 0);
1027 return;
1030 if (known_alignment_for_access_p (dr_info)
1031 && known_alignment_for_access_p (dr_peel_info))
1033 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1034 size_zero_node) < 0;
1035 int misal = DR_MISALIGNMENT (dr_info);
1036 misal += negative ? -npeel * dr_size : npeel * dr_size;
1037 misal &= DR_TARGET_ALIGNMENT (dr_info) - 1;
1038 SET_DR_MISALIGNMENT (dr_info, misal);
1039 return;
1042 if (dump_enabled_p ())
1043 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1044 "to unknown (-1).\n");
1045 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1049 /* Function verify_data_ref_alignment
1051 Return TRUE if DR_INFO can be handled with respect to alignment. */
1053 static opt_result
1054 verify_data_ref_alignment (dr_vec_info *dr_info)
1056 enum dr_alignment_support supportable_dr_alignment
1057 = vect_supportable_dr_alignment (dr_info, false);
1058 if (!supportable_dr_alignment)
1059 return opt_result::failure_at
1060 (dr_info->stmt->stmt,
1061 DR_IS_READ (dr_info->dr)
1062 ? "not vectorized: unsupported unaligned load: %T\n"
1063 : "not vectorized: unsupported unaligned store: %T\n",
1064 DR_REF (dr_info->dr));
1066 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1067 dump_printf_loc (MSG_NOTE, vect_location,
1068 "Vectorizing an unaligned access.\n");
1070 return opt_result::success ();
1073 /* Function vect_verify_datarefs_alignment
1075 Return TRUE if all data references in the loop can be
1076 handled with respect to alignment. */
1078 opt_result
1079 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1081 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1082 struct data_reference *dr;
1083 unsigned int i;
1085 FOR_EACH_VEC_ELT (datarefs, i, dr)
1087 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1088 stmt_vec_info stmt_info = dr_info->stmt;
1090 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1091 continue;
1093 /* For interleaving, only the alignment of the first access matters. */
1094 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1095 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1096 continue;
1098 /* Strided accesses perform only component accesses, alignment is
1099 irrelevant for them. */
1100 if (STMT_VINFO_STRIDED_P (stmt_info)
1101 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1102 continue;
1104 opt_result res = verify_data_ref_alignment (dr_info);
1105 if (!res)
1106 return res;
1109 return opt_result::success ();
1112 /* Given a memory reference EXP, return whether its alignment is less
1113 than its size. */
1115 static bool
1116 not_size_aligned (tree exp)
1118 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1119 return true;
1121 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1122 > get_object_alignment (exp));
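/* E.g. (hypothetical): an int member of a packed struct has TYPE_SIZE of 32
   bits while get_object_alignment may only guarantee 8 bits, so such a
   reference is considered not size-aligned.  */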
1125 /* Function vector_alignment_reachable_p
1127 Return true if vector alignment for DR_INFO is reachable by peeling
1128 a few loop iterations. Return false otherwise. */
1130 static bool
1131 vector_alignment_reachable_p (dr_vec_info *dr_info)
1133 stmt_vec_info stmt_info = dr_info->stmt;
1134 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1136 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1138 /* For interleaved access we peel only if number of iterations in
1139 the prolog loop ({VF - misalignment}), is a multiple of the
1140 number of the interleaved accesses. */
1141 int elem_size, mis_in_elements;
1143 /* FORNOW: handle only known alignment. */
1144 if (!known_alignment_for_access_p (dr_info))
1145 return false;
1147 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1148 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1149 elem_size = vector_element_size (vector_size, nelements);
1150 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1152 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1153 return false;
1156 /* If misalignment is known at the compile time then allow peeling
1157 only if natural alignment is reachable through peeling. */
1158 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1160 HOST_WIDE_INT elmsize =
1161 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1162 if (dump_enabled_p ())
1164 dump_printf_loc (MSG_NOTE, vect_location,
1165 "data size = %wd. misalignment = %d.\n", elmsize,
1166 DR_MISALIGNMENT (dr_info));
1168 if (DR_MISALIGNMENT (dr_info) % elmsize)
1170 if (dump_enabled_p ())
1171 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1172 "data size does not divide the misalignment.\n");
1173 return false;
1177 if (!known_alignment_for_access_p (dr_info))
1179 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1180 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1181 if (dump_enabled_p ())
1182 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1183 "Unknown misalignment, %snaturally aligned\n",
1184 is_packed ? "not " : "");
1185 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1188 return true;
1192 /* Calculate the cost of the memory access represented by DR_INFO. */
1194 static void
1195 vect_get_data_access_cost (dr_vec_info *dr_info,
1196 unsigned int *inside_cost,
1197 unsigned int *outside_cost,
1198 stmt_vector_for_cost *body_cost_vec,
1199 stmt_vector_for_cost *prologue_cost_vec)
1201 stmt_vec_info stmt_info = dr_info->stmt;
1202 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1203 int ncopies;
1205 if (PURE_SLP_STMT (stmt_info))
1206 ncopies = 1;
1207 else
1208 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1210 if (DR_IS_READ (dr_info->dr))
1211 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1212 prologue_cost_vec, body_cost_vec, false);
1213 else
1214 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1216 if (dump_enabled_p ())
1217 dump_printf_loc (MSG_NOTE, vect_location,
1218 "vect_get_data_access_cost: inside_cost = %d, "
1219 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1223 typedef struct _vect_peel_info
1225 dr_vec_info *dr_info;
1226 int npeel;
1227 unsigned int count;
1228 } *vect_peel_info;
1230 typedef struct _vect_peel_extended_info
1232 struct _vect_peel_info peel_info;
1233 unsigned int inside_cost;
1234 unsigned int outside_cost;
1235 } *vect_peel_extended_info;
1238 /* Peeling hashtable helpers. */
1240 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1242 static inline hashval_t hash (const _vect_peel_info *);
1243 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1246 inline hashval_t
1247 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1249 return (hashval_t) peel_info->npeel;
1252 inline bool
1253 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1255 return (a->npeel == b->npeel);
1259 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1261 static void
1262 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1263 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1264 int npeel)
1266 struct _vect_peel_info elem, *slot;
1267 _vect_peel_info **new_slot;
1268 bool supportable_dr_alignment
1269 = vect_supportable_dr_alignment (dr_info, true);
1271 elem.npeel = npeel;
1272 slot = peeling_htab->find (&elem);
1273 if (slot)
1274 slot->count++;
1275 else
1277 slot = XNEW (struct _vect_peel_info);
1278 slot->npeel = npeel;
1279 slot->dr_info = dr_info;
1280 slot->count = 1;
1281 new_slot = peeling_htab->find_slot (slot, INSERT);
1282 *new_slot = slot;
1285 if (!supportable_dr_alignment
1286 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1287 slot->count += VECT_MAX_COST;
1291 /* Traverse peeling hash table to find peeling option that aligns maximum
1292 number of data accesses. */
1295 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1296 _vect_peel_extended_info *max)
1298 vect_peel_info elem = *slot;
1300 if (elem->count > max->peel_info.count
1301 || (elem->count == max->peel_info.count
1302 && max->peel_info.npeel > elem->npeel))
1304 max->peel_info.npeel = elem->npeel;
1305 max->peel_info.count = elem->count;
1306 max->peel_info.dr_info = elem->dr_info;
1309 return 1;
1312 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1313 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1314 we assume DR0_INFO's misalignment will be zero after peeling. */
1316 static void
1317 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1318 dr_vec_info *dr0_info,
1319 unsigned int *inside_cost,
1320 unsigned int *outside_cost,
1321 stmt_vector_for_cost *body_cost_vec,
1322 stmt_vector_for_cost *prologue_cost_vec,
1323 unsigned int npeel,
1324 bool unknown_misalignment)
1326 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1327 unsigned i;
1328 data_reference *dr;
1330 FOR_EACH_VEC_ELT (datarefs, i, dr)
1332 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1333 stmt_vec_info stmt_info = dr_info->stmt;
1334 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1335 continue;
1337 /* For interleaving, only the alignment of the first access
1338 matters. */
1339 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1340 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1341 continue;
1343 /* Strided accesses perform only component accesses, alignment is
1344 irrelevant for them. */
1345 if (STMT_VINFO_STRIDED_P (stmt_info)
1346 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1347 continue;
1349 int save_misalignment;
1350 save_misalignment = DR_MISALIGNMENT (dr_info);
1351 if (npeel == 0)
1353 else if (unknown_misalignment && dr_info == dr0_info)
1354 SET_DR_MISALIGNMENT (dr_info, 0);
1355 else
1356 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1357 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1358 body_cost_vec, prologue_cost_vec);
1359 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1363 /* Traverse peeling hash table and calculate cost for each peeling option.
1364 Find the one with the lowest cost. */
1367 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1368 _vect_peel_extended_info *min)
1370 vect_peel_info elem = *slot;
1371 int dummy;
1372 unsigned int inside_cost = 0, outside_cost = 0;
1373 stmt_vec_info stmt_info = elem->dr_info->stmt;
1374 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1375 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1376 epilogue_cost_vec;
1378 prologue_cost_vec.create (2);
1379 body_cost_vec.create (2);
1380 epilogue_cost_vec.create (2);
1382 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1383 &outside_cost, &body_cost_vec,
1384 &prologue_cost_vec, elem->npeel, false);
1386 body_cost_vec.release ();
1388 outside_cost += vect_get_known_peeling_cost
1389 (loop_vinfo, elem->npeel, &dummy,
1390 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1391 &prologue_cost_vec, &epilogue_cost_vec);
1393 /* Prologue and epilogue costs are added to the target model later.
1394 These costs depend only on the scalar iteration cost, the
1395 number of peeling iterations finally chosen, and the number of
1396 misaligned statements. So discard the information found here. */
1397 prologue_cost_vec.release ();
1398 epilogue_cost_vec.release ();
1400 if (inside_cost < min->inside_cost
1401 || (inside_cost == min->inside_cost
1402 && outside_cost < min->outside_cost))
1404 min->inside_cost = inside_cost;
1405 min->outside_cost = outside_cost;
1406 min->peel_info.dr_info = elem->dr_info;
1407 min->peel_info.npeel = elem->npeel;
1408 min->peel_info.count = elem->count;
1411 return 1;
1415 /* Choose best peeling option by traversing peeling hash table and either
1416 choosing an option with the lowest cost (if cost model is enabled) or the
1417 option that aligns as many accesses as possible. */
1419 static struct _vect_peel_extended_info
1420 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1421 loop_vec_info loop_vinfo)
1423 struct _vect_peel_extended_info res;
1425 res.peel_info.dr_info = NULL;
1427 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1429 res.inside_cost = INT_MAX;
1430 res.outside_cost = INT_MAX;
1431 peeling_htab->traverse <_vect_peel_extended_info *,
1432 vect_peeling_hash_get_lowest_cost> (&res);
1434 else
1436 res.peel_info.count = 0;
1437 peeling_htab->traverse <_vect_peel_extended_info *,
1438 vect_peeling_hash_get_most_frequent> (&res);
1439 res.inside_cost = 0;
1440 res.outside_cost = 0;
1443 return res;
1446 /* Return true if the new peeling NPEEL is supported. */
1448 static bool
1449 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1450 unsigned npeel)
1452 unsigned i;
1453 struct data_reference *dr = NULL;
1454 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1455 enum dr_alignment_support supportable_dr_alignment;
1457 /* Ensure that all data refs can be vectorized after the peel. */
1458 FOR_EACH_VEC_ELT (datarefs, i, dr)
1460 int save_misalignment;
1462 if (dr == dr0_info->dr)
1463 continue;
1465 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1466 stmt_vec_info stmt_info = dr_info->stmt;
1467 /* For interleaving, only the alignment of the first access
1468 matters. */
1469 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1470 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1471 continue;
1473 /* Strided accesses perform only component accesses, alignment is
1474 irrelevant for them. */
1475 if (STMT_VINFO_STRIDED_P (stmt_info)
1476 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1477 continue;
1479 save_misalignment = DR_MISALIGNMENT (dr_info);
1480 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1481 supportable_dr_alignment
1482 = vect_supportable_dr_alignment (dr_info, false);
1483 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1485 if (!supportable_dr_alignment)
1486 return false;
1489 return true;
1492 /* Function vect_enhance_data_refs_alignment
1494 This pass will use loop versioning and loop peeling in order to enhance
1495 the alignment of data references in the loop.
1497 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1498 original loop is to be vectorized. Any other loops that are created by
1499 the transformations performed in this pass - are not supposed to be
1500 vectorized. This restriction will be relaxed.
1502 This pass will require a cost model to guide it whether to apply peeling
1503 or versioning or a combination of the two. For example, the scheme that
1504 Intel uses when given a loop with several memory accesses is as follows:
1505 choose one memory access ('p') whose alignment you want to force by doing
1506 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1507 other accesses are not necessarily aligned, or (2) use loop versioning to
1508 generate one loop in which all accesses are aligned, and another loop in
1509 which only 'p' is necessarily aligned.
1511 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1512 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1513 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1515 Devising a cost model is the most critical aspect of this work. It will
1516 guide us on which access to peel for, whether to use loop versioning, how
1517 many versions to create, etc. The cost model will probably consist of
1518 generic considerations as well as target specific considerations (on
1519 powerpc for example, misaligned stores are more painful than misaligned
1520 loads).
1522 Here are the general steps involved in alignment enhancements:
1524 -- original loop, before alignment analysis:
1525 for (i=0; i<N; i++){
1526 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1527 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1530 -- After vect_compute_data_refs_alignment:
1531 for (i=0; i<N; i++){
1532 x = q[i]; # DR_MISALIGNMENT(q) = 3
1533 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1536 -- Possibility 1: we do loop versioning:
1537 if (p is aligned) {
1538 for (i=0; i<N; i++){ # loop 1A
1539 x = q[i]; # DR_MISALIGNMENT(q) = 3
1540 p[i] = y; # DR_MISALIGNMENT(p) = 0
1543 else {
1544 for (i=0; i<N; i++){ # loop 1B
1545 x = q[i]; # DR_MISALIGNMENT(q) = 3
1546 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1550 -- Possibility 2: we do loop peeling:
1551 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1552 x = q[i];
1553 p[i] = y;
1555 for (i = 3; i < N; i++){ # loop 2A
1556 x = q[i]; # DR_MISALIGNMENT(q) = 0
1557 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1560 -- Possibility 3: combination of loop peeling and versioning:
1561 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1562 x = q[i];
1563 p[i] = y;
1565 if (p is aligned) {
1566 for (i = 3; i<N; i++){ # loop 3A
1567 x = q[i]; # DR_MISALIGNMENT(q) = 0
1568 p[i] = y; # DR_MISALIGNMENT(p) = 0
1571 else {
1572 for (i = 3; i<N; i++){ # loop 3B
1573 x = q[i]; # DR_MISALIGNMENT(q) = 0
1574 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1578 These loops are later passed to loop_transform to be vectorized. The
1579 vectorizer will use the alignment information to guide the transformation
1580 (whether to generate regular loads/stores, or with special handling for
1581 misalignment). */
1583 opt_result
1584 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1586 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1587 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1588 enum dr_alignment_support supportable_dr_alignment;
1589 dr_vec_info *first_store = NULL;
1590 dr_vec_info *dr0_info = NULL;
1591 struct data_reference *dr;
1592 unsigned int i, j;
1593 bool do_peeling = false;
1594 bool do_versioning = false;
1595 unsigned int npeel = 0;
1596 bool one_misalignment_known = false;
1597 bool one_misalignment_unknown = false;
1598 bool one_dr_unsupportable = false;
1599 dr_vec_info *unsupportable_dr_info = NULL;
1600 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1601 unsigned possible_npeel_number = 1;
1602 tree vectype;
1603 unsigned int mis, same_align_drs_max = 0;
1604 hash_table<peel_info_hasher> peeling_htab (1);
1606 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1608 /* Reset data so we can safely be called multiple times. */
1609 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1610 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1612 /* While cost model enhancements are expected in the future, the high level
1613 view of the code at this time is as follows:
1615 A) If there is a misaligned access then see if peeling to align
1616 this access can make all data references satisfy
1617 vect_supportable_dr_alignment. If so, update data structures
1618 as needed and return true.
1620 B) If peeling wasn't possible and there is a data reference with an
1621 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1622 then see if loop versioning checks can be used to make all data
1623 references satisfy vect_supportable_dr_alignment. If so, update
1624 data structures as needed and return true.
1626 C) If neither peeling nor versioning were successful then return false if
1627 any data reference does not satisfy vect_supportable_dr_alignment.
1629 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1631 Note, Possibility 3 above (which is peeling and versioning together) is not
1632 being done at this time. */
1634 /* (1) Peeling to force alignment. */
1636 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1637 Considerations:
1638 + How many accesses will become aligned due to the peeling
1639 - How many accesses will become unaligned due to the peeling,
1640 and the cost of misaligned accesses.
1641 - The cost of peeling (the extra runtime checks, the increase
1642 in code size). */
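/* A concrete (hypothetical) instance of the arithmetic used below: with a
   16-byte target alignment and 4-byte elements, DR_MISALIGNMENT == 4 is
   cured by peeling (16 - 4) / 4 == 3 scalar iterations, so 3 is the first
   npeel value entered into the peeling hash table for that data-ref.  */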
1644 FOR_EACH_VEC_ELT (datarefs, i, dr)
1646 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1647 stmt_vec_info stmt_info = dr_info->stmt;
1649 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1650 continue;
1652 /* For interleaving, only the alignment of the first access
1653 matters. */
1654 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1655 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1656 continue;
1658 /* For scatter-gather or invariant accesses there is nothing
1659 to enhance. */
1660 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1661 || integer_zerop (DR_STEP (dr)))
1662 continue;
1664 /* Strided accesses perform only component accesses, alignment is
1665 irrelevant for them. */
1666 if (STMT_VINFO_STRIDED_P (stmt_info)
1667 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1668 continue;
1670 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1671 do_peeling = vector_alignment_reachable_p (dr_info);
1672 if (do_peeling)
1674 if (known_alignment_for_access_p (dr_info))
1676 unsigned int npeel_tmp = 0;
1677 bool negative = tree_int_cst_compare (DR_STEP (dr),
1678 size_zero_node) < 0;
1680 vectype = STMT_VINFO_VECTYPE (stmt_info);
1681 unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);
1682 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1683 mis = (negative
1684 ? DR_MISALIGNMENT (dr_info)
1685 : -DR_MISALIGNMENT (dr_info));
1686 if (DR_MISALIGNMENT (dr_info) != 0)
1687 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1689 /* For multiple types, it is possible that the bigger type access
1690 will have more than one peeling option. E.g., a loop with two
1691 types: one of size (vector size / 4), and the other one of
1692 size (vector size / 8). The vectorization factor will be 8. If both
1693 accesses are misaligned by 3, the first one needs one scalar
1694 iteration to be aligned, and the second one needs 5. But the
1695 first one will be aligned also by peeling 5 scalar
1696 iterations, and in that case both accesses will be aligned.
1697 Hence, except for the immediate peeling amount, we also want
1698 to try to add full vector size, while we don't exceed
1699 vectorization factor.
1700 We do this automatically for cost model, since we calculate
1701 cost for every peeling option. */
1702 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1704 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1705 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1706 possible_npeel_number
1707 = vect_get_num_vectors (nscalars, vectype);
1709 /* NPEEL_TMP is 0 when there is no misalignment, but also
1710 allow peeling NELEMENTS. */
1711 if (DR_MISALIGNMENT (dr_info) == 0)
1712 possible_npeel_number++;
1715 /* Save info about DR in the hash table. Also include peeling
1716 amounts according to the explanation above. */
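/* Illustrative example: starting from NPEEL_TMP = 1 with a 16-byte
   TARGET_ALIGN and 4-byte elements, the candidates recorded are
   1, 5, 9, ..., i.e. one additional vector's worth of peeling per entry.  */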
1717 for (j = 0; j < possible_npeel_number; j++)
1719 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1720 dr_info, npeel_tmp);
1721 npeel_tmp += target_align / dr_size;
1724 one_misalignment_known = true;
1726 else
1728 /* If we don't know any misalignment values, we prefer
1729 peeling for the data-ref that has the maximum number of data-refs
1730 with the same alignment, unless the target prefers to align
1731 stores over loads. */
1732 unsigned same_align_drs
1733 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1734 if (!dr0_info
1735 || same_align_drs_max < same_align_drs)
1737 same_align_drs_max = same_align_drs;
1738 dr0_info = dr_info;
1740 /* For data-refs with the same number of related
1741 accesses prefer the one where the misalign
1742 computation will be invariant in the outermost loop. */
1743 else if (same_align_drs_max == same_align_drs)
1745 struct loop *ivloop0, *ivloop;
1746 ivloop0 = outermost_invariant_loop_for_expr
1747 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1748 ivloop = outermost_invariant_loop_for_expr
1749 (loop, DR_BASE_ADDRESS (dr));
1750 if ((ivloop && !ivloop0)
1751 || (ivloop && ivloop0
1752 && flow_loop_nested_p (ivloop, ivloop0)))
1753 dr0_info = dr_info;
1756 one_misalignment_unknown = true;
1758 /* Check for data refs with unsupportable alignment that
1759 can be peeled. */
1760 if (!supportable_dr_alignment)
1762 one_dr_unsupportable = true;
1763 unsupportable_dr_info = dr_info;
1766 if (!first_store && DR_IS_WRITE (dr))
1767 first_store = dr_info;
1770 else
1772 if (!aligned_access_p (dr_info))
1774 if (dump_enabled_p ())
1775 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1776 "vector alignment may not be reachable\n");
1777 break;
1782 /* Check if we can possibly peel the loop. */
1783 if (!vect_can_advance_ivs_p (loop_vinfo)
1784 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1785 || loop->inner)
1786 do_peeling = false;
1788 struct _vect_peel_extended_info peel_for_known_alignment;
1789 struct _vect_peel_extended_info peel_for_unknown_alignment;
1790 struct _vect_peel_extended_info best_peel;
1792 peel_for_unknown_alignment.inside_cost = INT_MAX;
1793 peel_for_unknown_alignment.outside_cost = INT_MAX;
1794 peel_for_unknown_alignment.peel_info.count = 0;
1796 if (do_peeling
1797 && one_misalignment_unknown)
1799 /* Check if the target requires to prefer stores over loads, i.e., if
1800 misaligned stores are more expensive than misaligned loads (taking
1801 drs with same alignment into account). */
1802 unsigned int load_inside_cost = 0;
1803 unsigned int load_outside_cost = 0;
1804 unsigned int store_inside_cost = 0;
1805 unsigned int store_outside_cost = 0;
1806 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
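/* With the misalignment unknown, assume that on average half a vector's
   worth of scalar iterations gets peeled, e.g. 4 when the vectorization
   factor used for costing is 8.  */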
1808 stmt_vector_for_cost dummy;
1809 dummy.create (2);
1810 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1811 &load_inside_cost,
1812 &load_outside_cost,
1813 &dummy, &dummy, estimated_npeels, true);
1814 dummy.release ();
1816 if (first_store)
1818 dummy.create (2);
1819 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1820 &store_inside_cost,
1821 &store_outside_cost,
1822 &dummy, &dummy,
1823 estimated_npeels, true);
1824 dummy.release ();
1826 else
1828 store_inside_cost = INT_MAX;
1829 store_outside_cost = INT_MAX;
1832 if (load_inside_cost > store_inside_cost
1833 || (load_inside_cost == store_inside_cost
1834 && load_outside_cost > store_outside_cost))
1836 dr0_info = first_store;
1837 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1838 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1840 else
1842 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1843 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1846 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1847 prologue_cost_vec.create (2);
1848 epilogue_cost_vec.create (2);
1850 int dummy2;
1851 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1852 (loop_vinfo, estimated_npeels, &dummy2,
1853 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1854 &prologue_cost_vec, &epilogue_cost_vec);
1856 prologue_cost_vec.release ();
1857 epilogue_cost_vec.release ();
1859 peel_for_unknown_alignment.peel_info.count = 1
1860 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1863 peel_for_unknown_alignment.peel_info.npeel = 0;
1864 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1866 best_peel = peel_for_unknown_alignment;
1868 peel_for_known_alignment.inside_cost = INT_MAX;
1869 peel_for_known_alignment.outside_cost = INT_MAX;
1870 peel_for_known_alignment.peel_info.count = 0;
1871 peel_for_known_alignment.peel_info.dr_info = NULL;
1873 if (do_peeling && one_misalignment_known)
1875 /* Peeling is possible, but there is no data access that is not supported
1876 unless aligned. So we try to choose the best possible peeling from
1877 the hash table. */
1878 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1879 (&peeling_htab, loop_vinfo);
1882 /* Compare costs of peeling for known and unknown alignment. */
1883 if (peel_for_known_alignment.peel_info.dr_info != NULL
1884 && peel_for_unknown_alignment.inside_cost
1885 >= peel_for_known_alignment.inside_cost)
1887 best_peel = peel_for_known_alignment;
1889 /* If the best peeling for known alignment has NPEEL == 0, perform no
1890 peeling at all except if there is an unsupportable dr that we can
1891 align. */
1892 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1893 do_peeling = false;
1896 /* If there is an unsupportable data ref, prefer this over all choices so far
1897 since we'd have to discard a chosen peeling unless it happened to
1898 align the unsupportable data ref as well. */
1899 if (one_dr_unsupportable)
1900 dr0_info = unsupportable_dr_info;
1901 else if (do_peeling)
1903 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1904 TODO: Use nopeel_outside_cost or get rid of it? */
1905 unsigned nopeel_inside_cost = 0;
1906 unsigned nopeel_outside_cost = 0;
1908 stmt_vector_for_cost dummy;
1909 dummy.create (2);
1910 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
1911 &nopeel_outside_cost, &dummy, &dummy,
1912 0, false);
1913 dummy.release ();
1915 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1916 costs will be recorded. */
1917 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1918 prologue_cost_vec.create (2);
1919 epilogue_cost_vec.create (2);
1921 int dummy2;
1922 nopeel_outside_cost += vect_get_known_peeling_cost
1923 (loop_vinfo, 0, &dummy2,
1924 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1925 &prologue_cost_vec, &epilogue_cost_vec);
1927 prologue_cost_vec.release ();
1928 epilogue_cost_vec.release ();
1930 npeel = best_peel.peel_info.npeel;
1931 dr0_info = best_peel.peel_info.dr_info;
1933 /* If not peeling at all is no more expensive than the best peeling we
1934 have found so far, don't perform any peeling. */
1935 if (nopeel_inside_cost <= best_peel.inside_cost)
1936 do_peeling = false;
1939 if (do_peeling)
1941 stmt_vec_info stmt_info = dr0_info->stmt;
1942 vectype = STMT_VINFO_VECTYPE (stmt_info);
1944 if (known_alignment_for_access_p (dr0_info))
1946 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
1947 size_zero_node) < 0;
1948 if (!npeel)
1950 /* Since it's known at compile time, compute the number of
1951 iterations in the peeled loop (the peeling factor) for use in
1952 updating DR_MISALIGNMENT values. The peeling factor is the
1953 vectorization factor minus the misalignment as an element
1954 count. */
1955 mis = (negative
1956 ? DR_MISALIGNMENT (dr0_info)
1957 : -DR_MISALIGNMENT (dr0_info));
1958 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
1959 npeel = ((mis & (target_align - 1))
1960 / vect_get_scalar_dr_size (dr0_info));
1963 /* For interleaved data access every iteration accesses all the
1964 members of the group, therefore we divide the number of iterations
1965 by the group size. */
1966 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1967 npeel /= DR_GROUP_SIZE (stmt_info);
1969 if (dump_enabled_p ())
1970 dump_printf_loc (MSG_NOTE, vect_location,
1971 "Try peeling by %d\n", npeel);
1974 /* Ensure that all datarefs can be vectorized after the peel. */
1975 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
1976 do_peeling = false;
1978 /* Check if all datarefs are supportable and log. */
1979 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
1981 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
1982 if (!stat)
1983 do_peeling = false;
1984 else
1985 return stat;
1988 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1989 if (do_peeling)
1991 unsigned max_allowed_peel
1992 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1993 if (max_allowed_peel != (unsigned)-1)
1995 unsigned max_peel = npeel;
1996 if (max_peel == 0)
1998 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
1999 max_peel = (target_align
2000 / vect_get_scalar_dr_size (dr0_info) - 1);
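/* Illustrative example: a 16-byte target alignment with 4-byte scalars
   bounds the runtime-decided peel amount by 16/4 - 1 = 3 iterations.  */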
2002 if (max_peel > max_allowed_peel)
2004 do_peeling = false;
2005 if (dump_enabled_p ())
2006 dump_printf_loc (MSG_NOTE, vect_location,
2007 "Disable peeling, max peels reached: %d\n", max_peel);
2012 /* Cost model #2 - if peeling may result in a remaining loop not
2013 iterating enough to be vectorized then do not peel. Since this
2014 is a cost heuristic rather than a correctness decision, use the
2015 most likely runtime value for variable vectorization factors. */
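/* Illustrative example: with an assumed VF of 4 and the peel amount left
   to run time (up to VF - 1 = 3 iterations), at least 4 + 3 = 7 scalar
   iterations must be known to remain, otherwise we do not peel.  */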
2016 if (do_peeling
2017 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2019 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2020 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2021 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2022 < assumed_vf + max_peel)
2023 do_peeling = false;
2026 if (do_peeling)
2028 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2029 If the misalignment of DR_i is identical to that of dr0 then set
2030 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2031 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2032 by the peeling factor times the element size of DR_i (MOD the
2033 vectorization factor times the size). Otherwise, the
2034 misalignment of DR_i must be set to unknown. */
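/* Illustrative example (assuming a 16-byte boundary): if dr0 is peeled by
   NPEEL = 1 and DR_i accesses 8-byte elements with a known misalignment
   of 8, its updated misalignment is (8 + 1*8) mod 16 = 0, i.e. DR_i also
   becomes aligned after the peel.  */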
2035 FOR_EACH_VEC_ELT (datarefs, i, dr)
2036 if (dr != dr0_info->dr)
2038 /* Strided accesses perform only component accesses, alignment
2039 is irrelevant for them. */
2040 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2041 stmt_info = dr_info->stmt;
2042 if (STMT_VINFO_STRIDED_P (stmt_info)
2043 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2044 continue;
2046 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2049 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2050 if (npeel)
2051 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2052 else
2053 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2054 = DR_MISALIGNMENT (dr0_info);
2055 SET_DR_MISALIGNMENT (dr0_info, 0);
2056 if (dump_enabled_p ())
2058 dump_printf_loc (MSG_NOTE, vect_location,
2059 "Alignment of access forced using peeling.\n");
2060 dump_printf_loc (MSG_NOTE, vect_location,
2061 "Peeling for alignment will be applied.\n");
2064 /* The inside-loop cost will be accounted for in vectorizable_load
2065 and vectorizable_store correctly with adjusted alignments.
2066 Drop the body_cst_vec on the floor here. */
2067 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2068 gcc_assert (stat);
2069 return stat;
2073 /* (2) Versioning to force alignment. */
2075 /* Try versioning if:
2076 1) optimize loop for speed
2077 2) there is at least one unsupported misaligned data ref with an unknown
2078 misalignment, and
2079 3) all misaligned data refs with a known misalignment are supported, and
2080 4) the number of runtime alignment checks is within reason. */
2082 do_versioning =
2083 optimize_loop_nest_for_speed_p (loop)
2084 && (!loop->inner); /* FORNOW */
2086 if (do_versioning)
2088 FOR_EACH_VEC_ELT (datarefs, i, dr)
2090 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2091 stmt_vec_info stmt_info = dr_info->stmt;
2093 /* For interleaving, only the alignment of the first access
2094 matters. */
2095 if (aligned_access_p (dr_info)
2096 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2097 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2098 continue;
2100 if (STMT_VINFO_STRIDED_P (stmt_info))
2102 /* Strided loads perform only component accesses, alignment is
2103 irrelevant for them. */
2104 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2105 continue;
2106 do_versioning = false;
2107 break;
2110 supportable_dr_alignment
2111 = vect_supportable_dr_alignment (dr_info, false);
2113 if (!supportable_dr_alignment)
2115 int mask;
2116 tree vectype;
2118 if (known_alignment_for_access_p (dr_info)
2119 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2120 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2122 do_versioning = false;
2123 break;
2126 vectype = STMT_VINFO_VECTYPE (stmt_info);
2127 gcc_assert (vectype);
2129 /* At present we don't support versioning for alignment
2130 with variable VF, since there's no guarantee that the
2131 VF is a power of two. We could relax this if we added
2132 a way of enforcing a power-of-two size. */
2133 unsigned HOST_WIDE_INT size;
2134 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2136 do_versioning = false;
2137 break;
2140 /* The rightmost bits of an aligned address must be zeros.
2141 Construct the mask needed for this test. For example,
2142 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2143 mask must be 15 = 0xf. */
2144 mask = size - 1;
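/* The versioning condition built later (see
   vect_create_cond_for_align_checks) essentially ORs together the
   addresses of all may-misalign references and tests that the result
   ANDed with this mask is zero, which is why a single mask must fit
   every reference.  */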
2146 /* FORNOW: use the same mask to test all potentially unaligned
2147 references in the loop. The vectorizer currently supports
2148 a single vector size, see the reference to
2149 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2150 vectorization factor is computed. */
2151 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2152 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2153 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2154 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2158 /* Versioning requires at least one misaligned data reference. */
2159 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2160 do_versioning = false;
2161 else if (!do_versioning)
2162 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2165 if (do_versioning)
2167 vec<stmt_vec_info> may_misalign_stmts
2168 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2169 stmt_vec_info stmt_info;
2171 /* It can now be assumed that the data references in the statements
2172 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2173 of the loop being vectorized. */
2174 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2176 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2177 SET_DR_MISALIGNMENT (dr_info, 0);
2178 if (dump_enabled_p ())
2179 dump_printf_loc (MSG_NOTE, vect_location,
2180 "Alignment of access forced using versioning.\n");
2183 if (dump_enabled_p ())
2184 dump_printf_loc (MSG_NOTE, vect_location,
2185 "Versioning for alignment will be applied.\n");
2187 /* Peeling and versioning can't be done together at this time. */
2188 gcc_assert (! (do_peeling && do_versioning));
2190 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2191 gcc_assert (stat);
2192 return stat;
2195 /* This point is reached if neither peeling nor versioning is being done. */
2196 gcc_assert (! (do_peeling || do_versioning));
2198 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2199 return stat;
2203 /* Function vect_find_same_alignment_drs.
2205 Update group and alignment relations in VINFO according to the chosen
2206 vectorization factor. */
2208 static void
2209 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2211 struct data_reference *dra = DDR_A (ddr);
2212 struct data_reference *drb = DDR_B (ddr);
2213 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2214 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2215 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2216 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2218 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2219 return;
2221 if (dra == drb)
2222 return;
2224 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2225 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2226 return;
2228 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2229 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2230 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2231 return;
2233 /* Two references with distance zero have the same alignment. */
2234 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2235 - wi::to_poly_offset (DR_INIT (drb)));
2236 if (maybe_ne (diff, 0))
2238 /* Get the wider of the two alignments. */
2239 unsigned int align_a = (vect_calculate_target_alignment (dr_info_a)
2240 / BITS_PER_UNIT);
2241 unsigned int align_b = (vect_calculate_target_alignment (dr_info_b)
2242 / BITS_PER_UNIT);
2243 unsigned int max_align = MAX (align_a, align_b);
2245 /* Require the gap to be a multiple of the larger vector alignment. */
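/* Illustrative example: with 16-byte target alignments on both
   references, DR_INITs that differ by 32 bytes still share the same
   alignment, whereas a 20-byte gap does not.  */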
2246 if (!multiple_p (diff, max_align))
2247 return;
2250 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2251 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2252 if (dump_enabled_p ())
2253 dump_printf_loc (MSG_NOTE, vect_location,
2254 "accesses have the same alignment: %T and %T\n",
2255 DR_REF (dra), DR_REF (drb));
2259 /* Function vect_analyze_data_refs_alignment
2261 Analyze the alignment of the data-references in the loop.
2262 Return FALSE if a data reference is found that cannot be vectorized. */
2264 opt_result
2265 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2267 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2269 /* Mark groups of data references with same alignment using
2270 data dependence information. */
2271 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2272 struct data_dependence_relation *ddr;
2273 unsigned int i;
2275 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2276 vect_find_same_alignment_drs (vinfo, ddr);
2278 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2279 struct data_reference *dr;
2281 vect_record_base_alignments (vinfo);
2282 FOR_EACH_VEC_ELT (datarefs, i, dr)
2284 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2285 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2286 vect_compute_data_ref_alignment (dr_info);
2289 return opt_result::success ();
2293 /* Analyze alignment of DRs of stmts in NODE. */
2295 static bool
2296 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2298 /* We vectorize from the first scalar stmt in the node unless
2299 the node is permuted in which case we start from the first
2300 element in the group. */
2301 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2302 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2303 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2304 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2306 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2307 vect_compute_data_ref_alignment (dr_info);
2308 /* For creating the data-ref pointer we need alignment of the
2309 first element anyway. */
2310 if (dr_info != first_dr_info)
2311 vect_compute_data_ref_alignment (first_dr_info);
2312 if (! verify_data_ref_alignment (dr_info))
2314 if (dump_enabled_p ())
2315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2316 "not vectorized: bad data alignment in basic "
2317 "block.\n");
2318 return false;
2321 return true;
2324 /* Function vect_slp_analyze_instance_alignment
2326 Analyze the alignment of the data-references in the SLP instance.
2327 Return FALSE if a data reference is found that cannot be vectorized. */
2329 bool
2330 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2332 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2334 slp_tree node;
2335 unsigned i;
2336 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2337 if (! vect_slp_analyze_and_verify_node_alignment (node))
2338 return false;
2340 node = SLP_INSTANCE_TREE (instance);
2341 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2342 && ! vect_slp_analyze_and_verify_node_alignment
2343 (SLP_INSTANCE_TREE (instance)))
2344 return false;
2346 return true;
2350 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2351 accesses of legal size, step, etc. Detect gaps, single element
2352 interleaving, and other special cases. Set grouped access info.
2353 Collect groups of strided stores for further use in SLP analysis.
2354 Worker for vect_analyze_group_access. */
2356 static bool
2357 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2359 data_reference *dr = dr_info->dr;
2360 tree step = DR_STEP (dr);
2361 tree scalar_type = TREE_TYPE (DR_REF (dr));
2362 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2363 stmt_vec_info stmt_info = dr_info->stmt;
2364 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2365 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2366 HOST_WIDE_INT dr_step = -1;
2367 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2368 bool slp_impossible = false;
2370 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2371 size of the interleaving group (including gaps). */
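/* Illustrative example: a 4-byte element accessed with a constant
   DR_STEP of 16 gives GROUPSIZE 4, i.e. four element slots (including
   any gaps) covered by the group per scalar iteration.  */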
2372 if (tree_fits_shwi_p (step))
2374 dr_step = tree_to_shwi (step);
2375 /* Check that STEP is a multiple of type size. Otherwise there is
2376 a non-element-sized gap at the end of the group which we
2377 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2378 ??? As we can handle non-constant step fine here we should
2379 simply remove uses of DR_GROUP_GAP between the last and first
2380 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2381 simply not include that gap. */
2382 if ((dr_step % type_size) != 0)
2384 if (dump_enabled_p ())
2385 dump_printf_loc (MSG_NOTE, vect_location,
2386 "Step %T is not a multiple of the element size"
2387 " for %T\n",
2388 step, DR_REF (dr));
2389 return false;
2391 groupsize = absu_hwi (dr_step) / type_size;
2393 else
2394 groupsize = 0;
2396 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2397 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2399 /* Check if this DR is a part of interleaving, and is a single
2400 element of the group that is accessed in the loop. */
2402 /* Gaps are supported only for loads. STEP must be a multiple of the type
2403 size. */
2404 if (DR_IS_READ (dr)
2405 && (dr_step % type_size) == 0
2406 && groupsize > 0)
2408 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2409 DR_GROUP_SIZE (stmt_info) = groupsize;
2410 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2411 if (dump_enabled_p ())
2412 dump_printf_loc (MSG_NOTE, vect_location,
2413 "Detected single element interleaving %T"
2414 " step %T\n",
2415 DR_REF (dr), step);
2417 return true;
2420 if (dump_enabled_p ())
2421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2422 "not consecutive access %G", stmt_info->stmt);
2424 if (bb_vinfo)
2426 /* Mark the statement as unvectorizable. */
2427 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2428 return true;
2431 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2432 STMT_VINFO_STRIDED_P (stmt_info) = true;
2433 return true;
2436 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2438 /* First stmt in the interleaving chain. Check the chain. */
2439 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2440 struct data_reference *data_ref = dr;
2441 unsigned int count = 1;
2442 tree prev_init = DR_INIT (data_ref);
2443 stmt_vec_info prev = stmt_info;
2444 HOST_WIDE_INT diff, gaps = 0;
2446 /* By construction, all group members have INTEGER_CST DR_INITs. */
2447 while (next)
2449 /* Skip same data-refs. In case that two or more stmts share
2450 data-ref (supported only for loads), we vectorize only the first
2451 stmt, and the rest get their vectorized loads from the first
2452 one. */
2453 if (!tree_int_cst_compare (DR_INIT (data_ref),
2454 DR_INIT (STMT_VINFO_DATA_REF (next))))
2456 if (DR_IS_WRITE (data_ref))
2458 if (dump_enabled_p ())
2459 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2460 "Two store stmts share the same dr.\n");
2461 return false;
2464 if (dump_enabled_p ())
2465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2466 "Two or more load stmts share the same dr.\n");
2468 /* For load use the same data-ref load. */
2469 DR_GROUP_SAME_DR_STMT (next) = prev;
2471 prev = next;
2472 next = DR_GROUP_NEXT_ELEMENT (next);
2473 continue;
2476 prev = next;
2477 data_ref = STMT_VINFO_DATA_REF (next);
2479 /* All group members have the same STEP by construction. */
2480 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2482 /* Check that the distance between two accesses is equal to the type
2483 size. Otherwise, we have gaps. */
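/* Illustrative example: group members of 4-byte elements with DR_INITs
   0, 4 and 12 give DIFF values 1 and 2; the second pair leaves a
   one-element gap.  */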
2484 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2485 - TREE_INT_CST_LOW (prev_init)) / type_size;
2486 if (diff != 1)
2488 /* FORNOW: SLP of accesses with gaps is not supported. */
2489 slp_impossible = true;
2490 if (DR_IS_WRITE (data_ref))
2492 if (dump_enabled_p ())
2493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2494 "interleaved store with gaps\n");
2495 return false;
2498 gaps += diff - 1;
2501 last_accessed_element += diff;
2503 /* Store the gap from the previous member of the group. If there is no
2504 gap in the access, DR_GROUP_GAP is always 1. */
2505 DR_GROUP_GAP (next) = diff;
2507 prev_init = DR_INIT (data_ref);
2508 next = DR_GROUP_NEXT_ELEMENT (next);
2509 /* Count the number of data-refs in the chain. */
2510 count++;
2513 if (groupsize == 0)
2514 groupsize = count + gaps;
2516 /* This could be UINT_MAX but as we are generating code in a very
2517 inefficient way we have to cap earlier. See PR78699 for example. */
2518 if (groupsize > 4096)
2520 if (dump_enabled_p ())
2521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2522 "group is too large\n");
2523 return false;
2526 /* Check that the size of the interleaving is equal to count for stores,
2527 i.e., that there are no gaps. */
2528 if (groupsize != count
2529 && !DR_IS_READ (dr))
2531 groupsize = count;
2532 STMT_VINFO_STRIDED_P (stmt_info) = true;
2535 /* If there is a gap after the last load in the group it is the
2536 difference between the groupsize and the last accessed
2537 element.
2538 When there is no gap, this difference should be 0. */
2539 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2541 DR_GROUP_SIZE (stmt_info) = groupsize;
2542 if (dump_enabled_p ())
2544 dump_printf_loc (MSG_NOTE, vect_location,
2545 "Detected interleaving ");
2546 if (DR_IS_READ (dr))
2547 dump_printf (MSG_NOTE, "load ");
2548 else if (STMT_VINFO_STRIDED_P (stmt_info))
2549 dump_printf (MSG_NOTE, "strided store ");
2550 else
2551 dump_printf (MSG_NOTE, "store ");
2552 dump_printf (MSG_NOTE, "of size %u starting with %G",
2553 (unsigned)groupsize, stmt_info->stmt);
2554 if (DR_GROUP_GAP (stmt_info) != 0)
2555 dump_printf_loc (MSG_NOTE, vect_location,
2556 "There is a gap of %u elements after the group\n",
2557 DR_GROUP_GAP (stmt_info));
2560 /* SLP: create an SLP data structure for every interleaving group of
2561 stores for further analysis in vect_analyze_slp. */
2562 if (DR_IS_WRITE (dr) && !slp_impossible)
2564 if (loop_vinfo)
2565 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2566 if (bb_vinfo)
2567 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2571 return true;
2574 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2575 accesses of legal size, step, etc. Detect gaps, single element
2576 interleaving, and other special cases. Set grouped access info.
2577 Collect groups of strided stores for further use in SLP analysis. */
2579 static bool
2580 vect_analyze_group_access (dr_vec_info *dr_info)
2582 if (!vect_analyze_group_access_1 (dr_info))
2584 /* Dissolve the group if present. */
2585 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2586 while (stmt_info)
2588 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2589 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2590 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2591 stmt_info = next;
2593 return false;
2595 return true;
2598 /* Analyze the access pattern of the data-reference DR_INFO.
2599 In case of non-consecutive accesses call vect_analyze_group_access() to
2600 analyze groups of accesses. */
2602 static bool
2603 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2605 data_reference *dr = dr_info->dr;
2606 tree step = DR_STEP (dr);
2607 tree scalar_type = TREE_TYPE (DR_REF (dr));
2608 stmt_vec_info stmt_info = dr_info->stmt;
2609 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2610 struct loop *loop = NULL;
2612 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2613 return true;
2615 if (loop_vinfo)
2616 loop = LOOP_VINFO_LOOP (loop_vinfo);
2618 if (loop_vinfo && !step)
2620 if (dump_enabled_p ())
2621 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2622 "bad data-ref access in loop\n");
2623 return false;
2626 /* Allow loads with zero step in inner-loop vectorization. */
2627 if (loop_vinfo && integer_zerop (step))
2629 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2630 if (!nested_in_vect_loop_p (loop, stmt_info))
2631 return DR_IS_READ (dr);
2632 /* Allow references with zero step for outer loops marked
2633 with pragma omp simd only - it guarantees absence of
2634 loop-carried dependencies between inner loop iterations. */
2635 if (loop->safelen < 2)
2637 if (dump_enabled_p ())
2638 dump_printf_loc (MSG_NOTE, vect_location,
2639 "zero step in inner loop of nest\n");
2640 return false;
2644 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2646 /* Interleaved accesses are not yet supported within outer-loop
2647 vectorization for references in the inner-loop. */
2648 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2650 /* For the rest of the analysis we use the outer-loop step. */
2651 step = STMT_VINFO_DR_STEP (stmt_info);
2652 if (integer_zerop (step))
2654 if (dump_enabled_p ())
2655 dump_printf_loc (MSG_NOTE, vect_location,
2656 "zero step in outer loop.\n");
2657 return DR_IS_READ (dr);
2661 /* Consecutive? */
2662 if (TREE_CODE (step) == INTEGER_CST)
2664 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2665 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2666 || (dr_step < 0
2667 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2669 /* Mark that it is not interleaving. */
2670 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2671 return true;
2675 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2677 if (dump_enabled_p ())
2678 dump_printf_loc (MSG_NOTE, vect_location,
2679 "grouped access in outer loop.\n");
2680 return false;
2684 /* Assume this is a DR handled by non-constant strided load case. */
2685 if (TREE_CODE (step) != INTEGER_CST)
2686 return (STMT_VINFO_STRIDED_P (stmt_info)
2687 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2688 || vect_analyze_group_access (dr_info)));
2690 /* Not consecutive access - check if it's a part of interleaving group. */
2691 return vect_analyze_group_access (dr_info);
2694 /* Compare two data-references DRA and DRB so that they can be sorted
2695 into chunks suitable for grouping. */
2697 static int
2698 dr_group_sort_cmp (const void *dra_, const void *drb_)
2700 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2701 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2702 int cmp;
2704 /* Stabilize sort. */
2705 if (dra == drb)
2706 return 0;
2708 /* DRs in different loops never belong to the same group. */
2709 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2710 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2711 if (loopa != loopb)
2712 return loopa->num < loopb->num ? -1 : 1;
2714 /* Ordering of DRs according to base. */
2715 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2716 DR_BASE_ADDRESS (drb));
2717 if (cmp != 0)
2718 return cmp;
2720 /* And according to DR_OFFSET. */
2721 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2722 if (cmp != 0)
2723 return cmp;
2725 /* Put reads before writes. */
2726 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2727 return DR_IS_READ (dra) ? -1 : 1;
2729 /* Then sort after access size. */
2730 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2731 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2732 if (cmp != 0)
2733 return cmp;
2735 /* And after step. */
2736 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2737 if (cmp != 0)
2738 return cmp;
2740 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2741 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2742 if (cmp == 0)
2743 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2744 return cmp;
2747 /* If OP is the result of a conversion, return the unconverted value,
2748 otherwise return null. */
2750 static tree
2751 strip_conversion (tree op)
2753 if (TREE_CODE (op) != SSA_NAME)
2754 return NULL_TREE;
2755 gimple *stmt = SSA_NAME_DEF_STMT (op);
2756 if (!is_gimple_assign (stmt)
2757 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2758 return NULL_TREE;
2759 return gimple_assign_rhs1 (stmt);
2762 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2763 and STMT2_INFO being in a single group. */
2765 static bool
2766 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
2768 if (gimple_assign_single_p (stmt1_info->stmt))
2769 return gimple_assign_single_p (stmt2_info->stmt);
2771 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2772 if (call1 && gimple_call_internal_p (call1))
2774 /* Check for two masked loads or two masked stores. */
2775 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2776 if (!call2 || !gimple_call_internal_p (call2))
2777 return false;
2778 internal_fn ifn = gimple_call_internal_fn (call1);
2779 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2780 return false;
2781 if (ifn != gimple_call_internal_fn (call2))
2782 return false;
2784 /* Check that the masks are the same. Cope with casts of masks,
2785 like those created by build_mask_conversion. */
2786 tree mask1 = gimple_call_arg (call1, 2);
2787 tree mask2 = gimple_call_arg (call2, 2);
2788 if (!operand_equal_p (mask1, mask2, 0))
2790 mask1 = strip_conversion (mask1);
2791 if (!mask1)
2792 return false;
2793 mask2 = strip_conversion (mask2);
2794 if (!mask2)
2795 return false;
2796 if (!operand_equal_p (mask1, mask2, 0))
2797 return false;
2799 return true;
2802 return false;
2805 /* Function vect_analyze_data_ref_accesses.
2807 Analyze the access pattern of all the data references in the loop.
2809 FORNOW: the only access pattern that is considered vectorizable is a
2810 simple step 1 (consecutive) access.
2812 FORNOW: handle only arrays and pointer accesses. */
2814 opt_result
2815 vect_analyze_data_ref_accesses (vec_info *vinfo)
2817 unsigned int i;
2818 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2819 struct data_reference *dr;
2821 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2823 if (datarefs.is_empty ())
2824 return opt_result::success ();
2826 /* Sort the array of datarefs to make building the interleaving chains
2827 linear. Don't modify the original vector's order, it is needed for
2828 determining what dependencies are reversed. */
2829 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2830 datarefs_copy.qsort (dr_group_sort_cmp);
2832 /* Build the interleaving chains. */
2833 for (i = 0; i < datarefs_copy.length () - 1;)
2835 data_reference_p dra = datarefs_copy[i];
2836 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2837 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2838 stmt_vec_info lastinfo = NULL;
2839 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2840 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2842 ++i;
2843 continue;
2845 for (i = i + 1; i < datarefs_copy.length (); ++i)
2847 data_reference_p drb = datarefs_copy[i];
2848 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2849 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2850 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2851 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2852 break;
2854 /* ??? Imperfect sorting (non-compatible types, non-modulo
2855 accesses, same accesses) can lead to a group being artificially
2856 split here as we don't just skip over those. If it really
2857 matters we can push those to a worklist and re-iterate
2858 over them. Then we can just skip ahead to the next DR here. */
2860 /* DRs in a different loop should not be put into the same
2861 interleaving group. */
2862 if (gimple_bb (DR_STMT (dra))->loop_father
2863 != gimple_bb (DR_STMT (drb))->loop_father)
2864 break;
2866 /* Check that the data-refs have the same first location (except init)
2867 and that they are both either store or load (not load and store,
2868 and not a mix of masked and unmasked accesses). */
2869 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2870 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2871 DR_BASE_ADDRESS (drb)) != 0
2872 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2873 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
2874 break;
2876 /* Check that the data-refs have the same constant size. */
2877 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2878 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2879 if (!tree_fits_uhwi_p (sza)
2880 || !tree_fits_uhwi_p (szb)
2881 || !tree_int_cst_equal (sza, szb))
2882 break;
2884 /* Check that the data-refs have the same step. */
2885 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2886 break;
2888 /* Check the types are compatible.
2889 ??? We don't distinguish this during sorting. */
2890 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2891 TREE_TYPE (DR_REF (drb))))
2892 break;
2894 /* Check that the DR_INITs are compile-time constants. */
2895 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2896 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2897 break;
2899 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2900 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2901 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2902 HOST_WIDE_INT init_prev
2903 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2904 gcc_assert (init_a <= init_b
2905 && init_a <= init_prev
2906 && init_prev <= init_b);
2908 /* Do not place the same access in the interleaving chain twice. */
2909 if (init_b == init_prev)
2911 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2912 < gimple_uid (DR_STMT (drb)));
2913 /* ??? For now we simply "drop" the later reference which is
2914 otherwise the same rather than finishing off this group.
2915 In the end we'd want to re-process duplicates forming
2916 multiple groups from the refs, likely by just collecting
2917 all candidates (including duplicates and split points
2918 below) in a vector and then processing them together. */
2919 continue;
2922 /* If init_b == init_a + the size of the type * k, we have an
2923 interleaving, and DRA is accessed before DRB. */
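/* Illustrative example: two 4-byte accesses with DR_INITs 0 and 8 can
   belong to one group (k = 2), whereas DR_INITs 0 and 6 cannot.  */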
2924 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2925 if (type_size_a == 0
2926 || (init_b - init_a) % type_size_a != 0)
2927 break;
2929 /* For stores the accesses must be adjacent. This splits
2930 groups into chunks we support (we don't support vectorization
2931 of stores with gaps). */
2932 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2933 break;
2935 /* If the step (when constant and nonzero) is no greater than the
2936 difference between the data-refs' inits, the second ref starts in a
2937 later iteration; split the group here to keep suitable sizes. */
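/* Illustrative example: with a step of 8 and 4-byte elements at
   DR_INITs 0 and 8, the second access already belongs to the next
   iteration's group, so the chain is split between them.  */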
2938 if (tree_fits_shwi_p (DR_STEP (dra)))
2940 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2941 if (step != 0 && step <= (init_b - init_a))
2942 break;
2945 if (dump_enabled_p ())
2946 dump_printf_loc (MSG_NOTE, vect_location,
2947 DR_IS_READ (dra)
2948 ? "Detected interleaving load %T and %T\n"
2949 : "Detected interleaving store %T and %T\n",
2950 DR_REF (dra), DR_REF (drb));
2952 /* Link the found element into the group list. */
2953 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
2955 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
2956 lastinfo = stmtinfo_a;
2958 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
2959 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
2960 lastinfo = stmtinfo_b;
2964 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2966 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2967 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
2968 && !vect_analyze_data_ref_access (dr_info))
2970 if (dump_enabled_p ())
2971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2972 "not vectorized: complicated access pattern.\n");
2974 if (is_a <bb_vec_info> (vinfo))
2976 /* Mark the statement as not vectorizable. */
2977 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
2978 continue;
2980 else
2982 datarefs_copy.release ();
2983 return opt_result::failure_at (dr_info->stmt->stmt,
2984 "not vectorized:"
2985 " complicated access pattern.\n");
2990 datarefs_copy.release ();
2991 return opt_result::success ();
2994 /* Function vect_vfa_segment_size.
2996 Input:
2997 DR_INFO: The data reference.
2998 LENGTH_FACTOR: segment length to consider.
3000 Return a value suitable for the dr_with_seg_len::seg_len field.
3001 This is the "distance travelled" by the pointer from the first
3002 iteration in the segment to the last. Note that it does not include
3003 the size of the access; in effect it only describes the first byte. */
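/* Illustrative example: with a DR_STEP of 4 bytes and LENGTH_FACTOR
   equal to a vectorization factor of 8, the segment length is
   4 * (8 - 1) = 28 bytes.  */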
3005 static tree
3006 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3008 length_factor = size_binop (MINUS_EXPR,
3009 fold_convert (sizetype, length_factor),
3010 size_one_node);
3011 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3012 length_factor);
3015 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3016 gives the worst-case number of bytes covered by the segment. */
3018 static unsigned HOST_WIDE_INT
3019 vect_vfa_access_size (dr_vec_info *dr_info)
3021 stmt_vec_info stmt_vinfo = dr_info->stmt;
3022 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3023 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3024 unsigned HOST_WIDE_INT access_size = ref_size;
3025 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3027 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3028 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
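/* Illustrative example: a 4-byte reference leading a group of size 4
   with a gap of 1 yields an access size of 4 * (4 - 1) = 12 bytes.  */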
3030 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3031 && (vect_supportable_dr_alignment (dr_info, false)
3032 == dr_explicit_realign_optimized))
3034 /* We might access a full vector's worth. */
3035 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3036 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3038 return access_size;
3041 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3042 describes. */
3044 static unsigned int
3045 vect_vfa_align (dr_vec_info *dr_info)
3047 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3050 /* Function vect_compile_time_alias.
3052 Given data references A and B with equal base and offset, see whether
3053 the alias relation can be decided at compilation time. Return 1 if
3054 it can and the references alias, 0 if it can and the references do
3055 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3056 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3057 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3059 static int
3060 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3061 tree segment_length_a, tree segment_length_b,
3062 unsigned HOST_WIDE_INT access_size_a,
3063 unsigned HOST_WIDE_INT access_size_b)
3065 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3066 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3067 poly_uint64 const_length_a;
3068 poly_uint64 const_length_b;
3070 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3071 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3072 [a, a+12) */
3073 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3075 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3076 offset_a = (offset_a + access_size_a) - const_length_a;
3078 else
3079 const_length_a = tree_to_poly_uint64 (segment_length_a);
3080 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3082 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3083 offset_b = (offset_b + access_size_b) - const_length_b;
3085 else
3086 const_length_b = tree_to_poly_uint64 (segment_length_b);
3088 const_length_a += access_size_a;
3089 const_length_b += access_size_b;
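/* Illustrative example: byte ranges [0, 16) and [32, 48) are known not
   to overlap (return 0), while [0, 16) and [8, 24) are known to overlap
   (return 1).  */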
3091 if (ranges_known_overlap_p (offset_a, const_length_a,
3092 offset_b, const_length_b))
3093 return 1;
3095 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3096 offset_b, const_length_b))
3097 return 0;
3099 return -1;
3102 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3103 in DDR is >= VF. */
3105 static bool
3106 dependence_distance_ge_vf (data_dependence_relation *ddr,
3107 unsigned int loop_depth, poly_uint64 vf)
3109 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3110 || DDR_NUM_DIST_VECTS (ddr) == 0)
3111 return false;
3113 /* If the dependence is exact, we should have limited the VF instead. */
3114 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3116 unsigned int i;
3117 lambda_vector dist_v;
3118 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3120 HOST_WIDE_INT dist = dist_v[loop_depth];
3121 if (dist != 0
3122 && !(dist > 0 && DDR_REVERSED_P (ddr))
3123 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3124 return false;
3127 if (dump_enabled_p ())
3128 dump_printf_loc (MSG_NOTE, vect_location,
3129 "dependence distance between %T and %T is >= VF\n",
3130 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3132 return true;
3135 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3137 static void
3138 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3140 dump_printf (dump_kind, "%s (%T) >= ",
3141 lower_bound.unsigned_p ? "unsigned" : "abs",
3142 lower_bound.expr);
3143 dump_dec (dump_kind, lower_bound.min_value);
3146 /* Record that the vectorized loop requires the vec_lower_bound described
3147 by EXPR, UNSIGNED_P and MIN_VALUE. */
3149 static void
3150 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3151 poly_uint64 min_value)
3153 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3154 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3155 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3157 unsigned_p &= lower_bounds[i].unsigned_p;
3158 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3159 if (lower_bounds[i].unsigned_p != unsigned_p
3160 || maybe_lt (lower_bounds[i].min_value, min_value))
3162 lower_bounds[i].unsigned_p = unsigned_p;
3163 lower_bounds[i].min_value = min_value;
3164 if (dump_enabled_p ())
3166 dump_printf_loc (MSG_NOTE, vect_location,
3167 "updating run-time check to ");
3168 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3169 dump_printf (MSG_NOTE, "\n");
3172 return;
3175 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3176 if (dump_enabled_p ())
3178 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3179 dump_lower_bound (MSG_NOTE, lower_bound);
3180 dump_printf (MSG_NOTE, "\n");
3182 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3185 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3186 will span fewer than GAP bytes. */
3188 static bool
3189 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3190 poly_int64 gap)
3192 stmt_vec_info stmt_info = dr_info->stmt;
3193 HOST_WIDE_INT count
3194 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3195 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3196 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3197 return (estimated_poly_value (gap)
3198 <= count * vect_get_scalar_dr_size (dr_info));
3201 /* Return true if we know that there is no alias between DR_INFO_A and
3202 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3203 When returning true, set *LOWER_BOUND_OUT to this N. */
3205 static bool
3206 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3207 poly_uint64 *lower_bound_out)
3209 /* Check that there is a constant gap of known sign between DR_A
3210 and DR_B. */
3211 data_reference *dr_a = dr_info_a->dr;
3212 data_reference *dr_b = dr_info_b->dr;
3213 poly_int64 init_a, init_b;
3214 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3215 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3216 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3217 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3218 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3219 || !ordered_p (init_a, init_b))
3220 return false;
3222 /* Sort DR_A and DR_B by the address they access. */
3223 if (maybe_lt (init_b, init_a))
3225 std::swap (init_a, init_b);
3226 std::swap (dr_info_a, dr_info_b);
3227 std::swap (dr_a, dr_b);
3230 /* If the two accesses could be dependent within a scalar iteration,
3231 make sure that we'd retain their order. */
3232 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3233 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3234 return false;
3236 /* There is no alias if abs (DR_STEP) is greater than or equal to
3237 the bytes spanned by the combination of the two accesses. */
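/* Illustrative example: two 4-byte references whose DR_INITs differ by
   8 bytes span 8 + 4 = 12 bytes, so any abs (DR_STEP) >= 12 rules out
   an alias at run time.  */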
3238 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3239 return true;
3242 /* Function vect_prune_runtime_alias_test_list.
3244 Prune a list of ddrs to be tested at run-time by versioning for alias.
3245 Merge several alias checks into one if possible.
3246 Return FALSE if the resulting list of ddrs is longer than allowed by
3247 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3249 opt_result
3250 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3252 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3253 hash_set <tree_pair_hash> compared_objects;
3255 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3256 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3257 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3258 vec<vec_object_pair> &check_unequal_addrs
3259 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3260 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3261 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3263 ddr_p ddr;
3264 unsigned int i;
3265 tree length_factor;
3267 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3269 /* Step values are irrelevant for aliasing if the number of vector
3270 iterations is equal to the number of scalar iterations (which can
3271 happen for fully-SLP loops). */
3272 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3274 if (!ignore_step_p)
3276 /* Convert the checks for nonzero steps into bound tests. */
3277 tree value;
3278 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3279 vect_check_lower_bound (loop_vinfo, value, true, 1);
3282 if (may_alias_ddrs.is_empty ())
3283 return opt_result::success ();
3285 comp_alias_ddrs.create (may_alias_ddrs.length ());
3287 unsigned int loop_depth
3288 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3289 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3291 /* First, we collect all data ref pairs for aliasing checks. */
3292 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3294 int comp_res;
3295 poly_uint64 lower_bound;
3296 tree segment_length_a, segment_length_b;
3297 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3298 unsigned int align_a, align_b;
3300 /* Ignore the alias if the VF we chose ended up being no greater
3301 than the dependence distance. */
3302 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3303 continue;
3305 if (DDR_OBJECT_A (ddr))
3307 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3308 if (!compared_objects.add (new_pair))
3310 if (dump_enabled_p ())
3311 dump_printf_loc (MSG_NOTE, vect_location,
3312 "checking that %T and %T"
3313 " have different addresses\n",
3314 new_pair.first, new_pair.second);
3315 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3317 continue;
3320 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3321 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3323 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3324 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3326 /* Skip the pair if inter-iteration dependencies are irrelevant
3327 and intra-iteration dependencies are guaranteed to be honored. */
3328 if (ignore_step_p
3329 && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3330 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3331 &lower_bound)))
3333 if (dump_enabled_p ())
3334 dump_printf_loc (MSG_NOTE, vect_location,
3335 "no need for alias check between "
3336 "%T and %T when VF is 1\n",
3337 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3338 continue;
3341 /* See whether we can handle the alias using a bounds check on
3342 the step, and whether that's likely to be the best approach.
3343 (It might not be, for example, if the minimum step is much larger
3344 than the number of bytes handled by one vector iteration.) */
3345 if (!ignore_step_p
3346 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3347 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3348 &lower_bound)
3349 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3350 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3352 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3353 if (dump_enabled_p ())
3355 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3356 "%T and %T when the step %T is outside ",
3357 DR_REF (dr_info_a->dr),
3358 DR_REF (dr_info_b->dr),
3359 DR_STEP (dr_info_a->dr));
3360 if (unsigned_p)
3361 dump_printf (MSG_NOTE, "[0");
3362 else
3364 dump_printf (MSG_NOTE, "(");
3365 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3367 dump_printf (MSG_NOTE, ", ");
3368 dump_dec (MSG_NOTE, lower_bound);
3369 dump_printf (MSG_NOTE, ")\n");
3371 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3372 unsigned_p, lower_bound);
3373 continue;
3376 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3377 if (dr_group_first_a)
3379 stmt_info_a = dr_group_first_a;
3380 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3383 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3384 if (dr_group_first_b)
3386 stmt_info_b = dr_group_first_b;
3387 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3390 if (ignore_step_p)
3392 segment_length_a = size_zero_node;
3393 segment_length_b = size_zero_node;
3395 else
3397 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3398 DR_STEP (dr_info_b->dr), 0))
3399 length_factor = scalar_loop_iters;
3400 else
3401 length_factor = size_int (vect_factor);
3402 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3403 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3405 access_size_a = vect_vfa_access_size (dr_info_a);
3406 access_size_b = vect_vfa_access_size (dr_info_b);
3407 align_a = vect_vfa_align (dr_info_a);
3408 align_b = vect_vfa_align (dr_info_b);
3410 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3411 DR_BASE_ADDRESS (dr_info_b->dr));
3412 if (comp_res == 0)
3413 comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3414 DR_OFFSET (dr_info_b->dr));
3416 /* See whether the alias is known at compilation time. */
3417 if (comp_res == 0
3418 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3419 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3420 && poly_int_tree_p (segment_length_a)
3421 && poly_int_tree_p (segment_length_b))
3423 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3424 segment_length_a,
3425 segment_length_b,
3426 access_size_a,
3427 access_size_b);
3428 if (res >= 0 && dump_enabled_p ())
3430 dump_printf_loc (MSG_NOTE, vect_location,
3431 "can tell at compile time that %T and %T",
3432 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3433 if (res == 0)
3434 dump_printf (MSG_NOTE, " do not alias\n");
3435 else
3436 dump_printf (MSG_NOTE, " alias\n");
3439 if (res == 0)
3440 continue;
3442 if (res == 1)
3443 return opt_result::failure_at (stmt_info_b->stmt,
3444 "not vectorized:"
3445 " compilation time alias: %G%G",
3446 stmt_info_a->stmt,
3447 stmt_info_b->stmt);
3450 dr_with_seg_len_pair_t dr_with_seg_len_pair
3451 (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3452 access_size_a, align_a),
3453 dr_with_seg_len (dr_info_b->dr, segment_length_b,
3454 access_size_b, align_b));
3456 /* Canonicalize pairs by sorting the two DR members. */
3457 if (comp_res > 0)
3458 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3460 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
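/* Try to reduce the number of runtime checks, e.g. by merging checks
   against neighbouring parts of the same objects, before comparing the
   count against the versioning limit below.  */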
3463 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3465 unsigned int count = (comp_alias_ddrs.length ()
3466 + check_unequal_addrs.length ());
3468 dump_printf_loc (MSG_NOTE, vect_location,
3469 "improved number of alias checks from %d to %d\n",
3470 may_alias_ddrs.length (), count);
3471 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3472 return opt_result::failure_at
3473 (vect_location,
3474 "number of versioning for alias "
3475 "run-time tests exceeds %d "
3476 "(--param vect-max-version-for-alias-checks)\n",
3477 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3479 return opt_result::success ();
3482 /* Check whether we can use an internal function for a gather load
3483 or scatter store. READ_P is true for loads and false for stores.
3484 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3485 the type of the memory elements being loaded or stored. OFFSET_BITS
3486 is the number of bits in each scalar offset and OFFSET_SIGN is the
3487 sign of the offset. SCALE is the amount by which the offset should
3488 be multiplied *after* it has been converted to address width.
3490 Return true if the function is supported, storing the function
3491 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
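/* For example (illustrative only, not from the original comment): a
   gather of 32-bit elements indexed by 32-bit signed offsets with a
   scale of 4 would query support for IFN_GATHER_LOAD (or
   IFN_MASK_GATHER_LOAD when MASKED_P) with OFFSET_BITS == 32 and
   SCALE == 4.  */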
3493 bool
3494 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3495 tree memory_type, unsigned int offset_bits,
3496 signop offset_sign, int scale,
3497 internal_fn *ifn_out, tree *element_type_out)
3499 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3500 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3501 if (offset_bits > element_bits)
3502 /* Internal functions require the offset to be the same width as
3503 the vector elements. We can extend narrower offsets, but it isn't
3504 safe to truncate wider offsets. */
3505 return false;
3507 if (element_bits != memory_bits)
3508 /* For now the vector elements must be the same width as the
3509 memory elements. */
3510 return false;
3512 /* Work out which function we need. */
3513 internal_fn ifn;
3514 if (read_p)
3515 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3516 else
3517 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3519 /* Test whether the target supports this combination. */
3520 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3521 offset_sign, scale))
3522 return false;
3524 *ifn_out = ifn;
3525 *element_type_out = TREE_TYPE (vectype);
3526 return true;
3529 /* STMT_INFO is a call to an internal gather load or scatter store function.
3530 Describe the operation in INFO. */
3532 static void
3533 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3534 gather_scatter_info *info)
3536 gcall *call = as_a <gcall *> (stmt_info->stmt);
3537 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3538 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3540 info->ifn = gimple_call_internal_fn (call);
3541 info->decl = NULL_TREE;
3542 info->base = gimple_call_arg (call, 0);
3543 info->offset = gimple_call_arg (call, 1);
3544 info->offset_dt = vect_unknown_def_type;
3545 info->offset_vectype = NULL_TREE;
3546 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3547 info->element_type = TREE_TYPE (vectype);
3548 info->memory_type = TREE_TYPE (DR_REF (dr));
3551 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3552 gather load or scatter store. Describe the operation in *INFO if so. */
3554 bool
3555 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3556 gather_scatter_info *info)
3558 HOST_WIDE_INT scale = 1;
3559 poly_int64 pbitpos, pbitsize;
3560 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3561 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3562 tree offtype = NULL_TREE;
3563 tree decl = NULL_TREE, base, off;
3564 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3565 tree memory_type = TREE_TYPE (DR_REF (dr));
3566 machine_mode pmode;
3567 int punsignedp, reversep, pvolatilep = 0;
3568 internal_fn ifn;
3569 tree element_type;
3570 bool masked_p = false;
3572 /* See whether this is already a call to a gather/scatter internal function.
3573 If not, see whether it's a masked load or store. */
3574 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3575 if (call && gimple_call_internal_p (call))
3577 ifn = gimple_call_internal_fn (call);
3578 if (internal_gather_scatter_fn_p (ifn))
3580 vect_describe_gather_scatter_call (stmt_info, info);
3581 return true;
3583 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3586 /* True if we should aim to use internal functions rather than
3587 built-in functions. */
3588 bool use_ifn_p = (DR_IS_READ (dr)
3589 ? supports_vec_gather_load_p ()
3590 : supports_vec_scatter_store_p ());
3592 base = DR_REF (dr);
3593 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3594 see if we can use the def stmt of the address. */
3595 if (masked_p
3596 && TREE_CODE (base) == MEM_REF
3597 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3598 && integer_zerop (TREE_OPERAND (base, 1))
3599 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3601 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3602 if (is_gimple_assign (def_stmt)
3603 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3604 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3607 /* The gather and scatter builtins need address of the form
3608 loop_invariant + vector * {1, 2, 4, 8}
3609 or
3610 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3611 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3612 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3613 multiplications and additions in it. To get a vector, we need
3614 a single SSA_NAME that will be defined in the loop and will
3615 contain everything that is not loop invariant and that can be
3616 vectorized. The following code attempts to find such a preexisting
3617 SSA_NAME OFF and put the loop invariants into a tree BASE
3618 that can be gimplified before the loop. */
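/* Illustrative example (an assumed typical case, not part of the
   original comment): for a loop such as

     for (i = 0; i < n; i++)
       res += p[idx[i]];

   the code below would typically end up with OFF being the SSA_NAME
   holding idx[i] (possibly after stripping a widening conversion),
   SCALE the element size peeled off a multiplication, and BASE the
   loop-invariant pointer P plus any constant byte offset.  */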
3619 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3620 &punsignedp, &reversep, &pvolatilep);
3621 if (reversep)
3622 return false;
3624 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3626 if (TREE_CODE (base) == MEM_REF)
3628 if (!integer_zerop (TREE_OPERAND (base, 1)))
3630 if (off == NULL_TREE)
3631 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3632 else
3633 off = size_binop (PLUS_EXPR, off,
3634 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3636 base = TREE_OPERAND (base, 0);
3638 else
3639 base = build_fold_addr_expr (base);
3641 if (off == NULL_TREE)
3642 off = size_zero_node;
3644 /* If BASE is not loop invariant: when OFF is 0, start with just the
3645 constant offset in the loop-invariant BASE and continue with BASE
3646 as OFF; otherwise give up.
3647 We could handle the latter case by gimplifying the addition of BASE + OFF
3648 into some SSA_NAME and using that as OFF, but for now punt. */
3649 if (!expr_invariant_in_loop_p (loop, base))
3651 if (!integer_zerop (off))
3652 return false;
3653 off = base;
3654 base = size_int (pbytepos);
3656 /* Otherwise put base + constant offset into the loop invariant BASE
3657 and continue with OFF. */
3658 else
3660 base = fold_convert (sizetype, base);
3661 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3664 /* OFF at this point may be either a SSA_NAME or some tree expression
3665 from get_inner_reference. Try to peel off loop invariants from it
3666 into BASE as long as possible. */
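/* Each round of the loop below inspects the operation defining OFF:
   loop-invariant addends are folded into BASE, a constant multiplier
   becomes SCALE, conversions that keep the precision are stripped, and
   a widening conversion ends the walk by recording the narrower type
   in OFFTYPE.  Anything else stops the peeling.  */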
3667 STRIP_NOPS (off);
3668 while (offtype == NULL_TREE)
3670 enum tree_code code;
3671 tree op0, op1, add = NULL_TREE;
3673 if (TREE_CODE (off) == SSA_NAME)
3675 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3677 if (expr_invariant_in_loop_p (loop, off))
3678 return false;
3680 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3681 break;
3683 op0 = gimple_assign_rhs1 (def_stmt);
3684 code = gimple_assign_rhs_code (def_stmt);
3685 op1 = gimple_assign_rhs2 (def_stmt);
3687 else
3689 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3690 return false;
3691 code = TREE_CODE (off);
3692 extract_ops_from_tree (off, &code, &op0, &op1);
3694 switch (code)
3696 case POINTER_PLUS_EXPR:
3697 case PLUS_EXPR:
3698 if (expr_invariant_in_loop_p (loop, op0))
3700 add = op0;
3701 off = op1;
3702 do_add:
3703 add = fold_convert (sizetype, add);
3704 if (scale != 1)
3705 add = size_binop (MULT_EXPR, add, size_int (scale));
3706 base = size_binop (PLUS_EXPR, base, add);
3707 continue;
3709 if (expr_invariant_in_loop_p (loop, op1))
3711 add = op1;
3712 off = op0;
3713 goto do_add;
3715 break;
3716 case MINUS_EXPR:
3717 if (expr_invariant_in_loop_p (loop, op1))
3719 add = fold_convert (sizetype, op1);
3720 add = size_binop (MINUS_EXPR, size_zero_node, add);
3721 off = op0;
3722 goto do_add;
3724 break;
3725 case MULT_EXPR:
3726 if (scale == 1 && tree_fits_shwi_p (op1))
3728 int new_scale = tree_to_shwi (op1);
3729 /* Only treat this as a scaling operation if the target
3730 supports it. */
3731 if (use_ifn_p
3732 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3733 vectype, memory_type, 1,
3734 TYPE_SIGN (TREE_TYPE (op0)),
3735 new_scale, &ifn,
3736 &element_type))
3737 break;
3738 scale = new_scale;
3739 off = op0;
3740 continue;
3742 break;
3743 case SSA_NAME:
3744 off = op0;
3745 continue;
3746 CASE_CONVERT:
3747 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3748 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3749 break;
3750 if (TYPE_PRECISION (TREE_TYPE (op0))
3751 == TYPE_PRECISION (TREE_TYPE (off)))
3753 off = op0;
3754 continue;
3757 /* The internal functions need the offset to be the same width
3758 as the elements of VECTYPE. Don't include operations that
3759 cast the offset from that width to a different width. */
3760 if (use_ifn_p
3761 && (int_size_in_bytes (TREE_TYPE (vectype))
3762 == int_size_in_bytes (TREE_TYPE (off))))
3763 break;
3765 if (TYPE_PRECISION (TREE_TYPE (op0))
3766 < TYPE_PRECISION (TREE_TYPE (off)))
3768 off = op0;
3769 offtype = TREE_TYPE (off);
3770 STRIP_NOPS (off);
3771 continue;
3773 break;
3774 default:
3775 break;
3777 break;
3780 /* If at the end OFF still isn't a SSA_NAME or isn't
3781 defined in the loop, punt. */
3782 if (TREE_CODE (off) != SSA_NAME
3783 || expr_invariant_in_loop_p (loop, off))
3784 return false;
3786 if (offtype == NULL_TREE)
3787 offtype = TREE_TYPE (off);
3789 if (use_ifn_p)
3791 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3792 memory_type, TYPE_PRECISION (offtype),
3793 TYPE_SIGN (offtype), scale, &ifn,
3794 &element_type))
3795 return false;
3797 else
3799 if (DR_IS_READ (dr))
3801 if (targetm.vectorize.builtin_gather)
3802 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3804 else
3806 if (targetm.vectorize.builtin_scatter)
3807 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3810 if (!decl)
3811 return false;
3813 ifn = IFN_LAST;
3814 element_type = TREE_TYPE (vectype);
3817 info->ifn = ifn;
3818 info->decl = decl;
3819 info->base = base;
3820 info->offset = off;
3821 info->offset_dt = vect_unknown_def_type;
3822 info->offset_vectype = NULL_TREE;
3823 info->scale = scale;
3824 info->element_type = element_type;
3825 info->memory_type = memory_type;
3826 return true;
3829 /* Find the data references in STMT, analyze them with respect to LOOP and
3830 append them to DATAREFS. Return an opt_result failure if the datarefs
3831 in this stmt cannot be handled. */
3833 opt_result
3834 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3835 vec<data_reference_p> *datarefs)
3837 /* We can ignore clobbers for dataref analysis - they are removed during
3838 loop vectorization and BB vectorization checks dependences with a
3839 stmt walk. */
3840 if (gimple_clobber_p (stmt))
3841 return opt_result::success ();
3843 if (gimple_has_volatile_ops (stmt))
3844 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
3845 stmt);
3847 if (stmt_can_throw_internal (stmt))
3848 return opt_result::failure_at (stmt,
3849 "not vectorized:"
3850 " statement can throw an exception: %G",
3851 stmt);
3853 auto_vec<data_reference_p, 2> refs;
3854 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
3855 if (!res)
3856 return res;
3858 if (refs.is_empty ())
3859 return opt_result::success ();
3861 if (refs.length () > 1)
3862 return opt_result::failure_at (stmt,
3863 "not vectorized:"
3864 " more than one data ref in stmt: %G", stmt);
3866 if (gcall *call = dyn_cast <gcall *> (stmt))
3867 if (!gimple_call_internal_p (call)
3868 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
3869 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
3870 return opt_result::failure_at (stmt,
3871 "not vectorized: dr in a call %G", stmt);
3873 data_reference_p dr = refs.pop ();
3874 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3875 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3876 return opt_result::failure_at (stmt,
3877 "not vectorized:"
3878 " statement is bitfield access %G", stmt);
3880 if (DR_BASE_ADDRESS (dr)
3881 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3882 return opt_result::failure_at (stmt,
3883 "not vectorized:"
3884 " base addr of dr is a constant\n");
3886 /* Check whether this may be a SIMD lane access and adjust the
3887 DR to make it easier for us to handle it. */
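/* Illustrative shape of such an access (an assumption about the common
   case, not part of the original comment): inside an OpenMP simd loop a
   privatized array is accessed as tmp[lane], where LANE is the result
   of an IFN_GOMP_SIMD_LANE call; its DR_OFFSET has the form
   lane * element_size, which the code below rewrites into a zero offset
   plus a step of element_size and marks as a simd-lane access.  */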
3888 if (loop
3889 && loop->simduid
3890 && (!DR_BASE_ADDRESS (dr)
3891 || !DR_OFFSET (dr)
3892 || !DR_INIT (dr)
3893 || !DR_STEP (dr)))
3895 struct data_reference *newdr
3896 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
3897 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
3898 if (DR_BASE_ADDRESS (newdr)
3899 && DR_OFFSET (newdr)
3900 && DR_INIT (newdr)
3901 && DR_STEP (newdr)
3902 && integer_zerop (DR_STEP (newdr)))
3904 tree off = DR_OFFSET (newdr);
3905 STRIP_NOPS (off);
3906 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3907 && TREE_CODE (off) == MULT_EXPR
3908 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3910 tree step = TREE_OPERAND (off, 1);
3911 off = TREE_OPERAND (off, 0);
3912 STRIP_NOPS (off);
3913 if (CONVERT_EXPR_P (off)
3914 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
3915 < TYPE_PRECISION (TREE_TYPE (off))))
3916 off = TREE_OPERAND (off, 0);
3917 if (TREE_CODE (off) == SSA_NAME)
3919 gimple *def = SSA_NAME_DEF_STMT (off);
3920 tree reft = TREE_TYPE (DR_REF (newdr));
3921 if (is_gimple_call (def)
3922 && gimple_call_internal_p (def)
3923 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
3925 tree arg = gimple_call_arg (def, 0);
3926 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3927 arg = SSA_NAME_VAR (arg);
3928 if (arg == loop->simduid
3929 /* For now. */
3930 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
3932 DR_OFFSET (newdr) = ssize_int (0);
3933 DR_STEP (newdr) = step;
3934 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
3935 DR_STEP_ALIGNMENT (newdr)
3936 = highest_pow2_factor (step);
3937 /* Mark as simd-lane access. */
3938 newdr->aux = (void *)-1;
3939 free_data_ref (dr);
3940 datarefs->safe_push (newdr);
3941 return opt_result::success ();
3947 free_data_ref (newdr);
3950 datarefs->safe_push (dr);
3951 return opt_result::success ();
3954 /* Function vect_analyze_data_refs.
3956 Find all the data references in the loop or basic block.
3958 The general structure of the analysis of data refs in the vectorizer is as
3959 follows:
3960 1- vect_analyze_data_refs(loop/bb): call
3961 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3962 in the loop/bb and their dependences.
3963 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3964 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3965 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3969 opt_result
3970 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3972 struct loop *loop = NULL;
3973 unsigned int i;
3974 struct data_reference *dr;
3975 tree scalar_type;
3977 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
3979 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3980 loop = LOOP_VINFO_LOOP (loop_vinfo);
3982 /* Go through the data-refs, check that the analysis succeeded. Update
3983 pointer from stmt_vec_info struct to DR and vectype. */
3985 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3986 FOR_EACH_VEC_ELT (datarefs, i, dr)
3988 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3989 poly_uint64 vf;
3991 gcc_assert (DR_REF (dr));
3992 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
3993 gcc_assert (!stmt_info->dr_aux.dr);
3994 stmt_info->dr_aux.dr = dr;
3995 stmt_info->dr_aux.stmt = stmt_info;
3997 /* Check that analysis of the data-ref succeeded. */
3998 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3999 || !DR_STEP (dr))
4001 bool maybe_gather
4002 = DR_IS_READ (dr)
4003 && !TREE_THIS_VOLATILE (DR_REF (dr))
4004 && (targetm.vectorize.builtin_gather != NULL
4005 || supports_vec_gather_load_p ());
4006 bool maybe_scatter
4007 = DR_IS_WRITE (dr)
4008 && !TREE_THIS_VOLATILE (DR_REF (dr))
4009 && (targetm.vectorize.builtin_scatter != NULL
4010 || supports_vec_scatter_store_p ());
4012 /* If target supports vector gather loads or scatter stores,
4013 see if they can be used. */
4014 if (is_a <loop_vec_info> (vinfo)
4015 && !nested_in_vect_loop_p (loop, stmt_info))
4017 if (maybe_gather || maybe_scatter)
4019 if (maybe_gather)
4020 gatherscatter = GATHER;
4021 else
4022 gatherscatter = SCATTER;
4026 if (gatherscatter == SG_NONE)
4028 if (dump_enabled_p ())
4029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4030 "not vectorized: data ref analysis "
4031 "failed %G", stmt_info->stmt);
4032 if (is_a <bb_vec_info> (vinfo))
4034 /* In BB vectorization the ref can still participate
4035 in dependence analysis, we just can't vectorize it. */
4036 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4037 continue;
4039 return opt_result::failure_at (stmt_info->stmt,
4040 "not vectorized:"
4041 " data ref analysis failed: %G",
4042 stmt_info->stmt);
4046 /* See if this was detected as SIMD lane access. */
4047 if (dr->aux == (void *)-1)
4049 if (nested_in_vect_loop_p (loop, stmt_info))
4050 return opt_result::failure_at (stmt_info->stmt,
4051 "not vectorized:"
4052 " data ref analysis failed: %G",
4053 stmt_info->stmt);
4054 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4057 tree base = get_base_address (DR_REF (dr));
4058 if (base && VAR_P (base) && DECL_NONALIASED (base))
4060 if (dump_enabled_p ())
4061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4062 "not vectorized: base object not addressable "
4063 "for stmt: %G", stmt_info->stmt);
4064 if (is_a <bb_vec_info> (vinfo))
4066 /* In BB vectorization the ref can still participate
4067 in dependence analysis, we just can't vectorize it. */
4068 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4069 continue;
4071 return opt_result::failure_at (stmt_info->stmt,
4072 "not vectorized: base object not"
4073 " addressable for stmt: %G",
4074 stmt_info->stmt);
4077 if (is_a <loop_vec_info> (vinfo)
4078 && DR_STEP (dr)
4079 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4081 if (nested_in_vect_loop_p (loop, stmt_info))
4082 return opt_result::failure_at (stmt_info->stmt,
4083 "not vectorized:"
4084 "not suitable for strided load %G",
4085 stmt_info->stmt);
4086 STMT_VINFO_STRIDED_P (stmt_info) = true;
4089 /* Update DR field in stmt_vec_info struct. */
4091 /* If the dataref is in an inner-loop of the loop that is considered
4092 for vectorization, we also want to analyze the access relative to
4093 the outer-loop (DR contains information only relative to the
4094 inner-most enclosing loop). We do that by building a reference to the
4095 first location accessed by the inner-loop, and analyze it relative to
4096 the outer-loop. */
4097 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4099 /* Build a reference to the first location accessed by the
4100 inner loop: *(BASE + INIT + OFFSET). By construction,
4101 this address must be invariant in the inner loop, so we
4102 can consider it as being used in the outer loop. */
4103 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4104 tree offset = unshare_expr (DR_OFFSET (dr));
4105 tree init = unshare_expr (DR_INIT (dr));
4106 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4107 init, offset);
4108 tree init_addr = fold_build_pointer_plus (base, init_offset);
4109 tree init_ref = build_fold_indirect_ref (init_addr);
4111 if (dump_enabled_p ())
4112 dump_printf_loc (MSG_NOTE, vect_location,
4113 "analyze in outer loop: %T\n", init_ref);
4115 opt_result res
4116 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4117 init_ref, loop, stmt_info->stmt);
4118 if (!res)
4119 /* dr_analyze_innermost already explained the failure. */
4120 return res;
4122 if (dump_enabled_p ())
4123 dump_printf_loc (MSG_NOTE, vect_location,
4124 "\touter base_address: %T\n"
4125 "\touter offset from base address: %T\n"
4126 "\touter constant offset from base address: %T\n"
4127 "\touter step: %T\n"
4128 "\touter base alignment: %d\n\n"
4129 "\touter base misalignment: %d\n"
4130 "\touter offset alignment: %d\n"
4131 "\touter step alignment: %d\n",
4132 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4133 STMT_VINFO_DR_OFFSET (stmt_info),
4134 STMT_VINFO_DR_INIT (stmt_info),
4135 STMT_VINFO_DR_STEP (stmt_info),
4136 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4137 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4138 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4139 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4142 /* Set vectype for STMT. */
4143 scalar_type = TREE_TYPE (DR_REF (dr));
4144 STMT_VINFO_VECTYPE (stmt_info)
4145 = get_vectype_for_scalar_type (scalar_type);
4146 if (!STMT_VINFO_VECTYPE (stmt_info))
4148 if (dump_enabled_p ())
4150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4151 "not vectorized: no vectype for stmt: %G",
4152 stmt_info->stmt);
4153 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4154 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4155 scalar_type);
4156 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4159 if (is_a <bb_vec_info> (vinfo))
4161 /* No vector type is fine, the ref can still participate
4162 in dependence analysis, we just can't vectorize it. */
4163 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4164 continue;
4166 return opt_result::failure_at (stmt_info->stmt,
4167 "not vectorized:"
4168 " no vectype for stmt: %G"
4169 " scalar_type: %T\n",
4170 stmt_info->stmt, scalar_type);
4172 else
4174 if (dump_enabled_p ())
4175 dump_printf_loc (MSG_NOTE, vect_location,
4176 "got vectype for stmt: %G%T\n",
4177 stmt_info->stmt, STMT_VINFO_VECTYPE (stmt_info));
4180 /* Adjust the minimal vectorization factor according to the
4181 vector type. */
4182 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4183 *min_vf = upper_bound (*min_vf, vf);
4185 if (gatherscatter != SG_NONE)
4187 gather_scatter_info gs_info;
4188 if (!vect_check_gather_scatter (stmt_info,
4189 as_a <loop_vec_info> (vinfo),
4190 &gs_info)
4191 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4192 return opt_result::failure_at
4193 (stmt_info->stmt,
4194 (gatherscatter == GATHER) ?
4195 "not vectorized: not suitable for gather load %G" :
4196 "not vectorized: not suitable for scatter store %G",
4197 stmt_info->stmt);
4198 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4202 /* We used to stop processing and prune the list here. Verify we no
4203 longer need to. */
4204 gcc_assert (i == datarefs.length ());
4206 return opt_result::success ();
4210 /* Function vect_get_new_vect_var.
4212 Returns a name for a new variable. The current naming scheme picks a
4213 prefix ("vect_", "vectp_", "stmp_" or "mask_") according to VAR_KIND
4214 for vectorizer-generated variables and appends NAME to it if
4215 provided. */
4217 tree
4218 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4220 const char *prefix;
4221 tree new_vect_var;
4223 switch (var_kind)
4225 case vect_simple_var:
4226 prefix = "vect";
4227 break;
4228 case vect_scalar_var:
4229 prefix = "stmp";
4230 break;
4231 case vect_mask_var:
4232 prefix = "mask";
4233 break;
4234 case vect_pointer_var:
4235 prefix = "vectp";
4236 break;
4237 default:
4238 gcc_unreachable ();
4241 if (name)
4243 char* tmp = concat (prefix, "_", name, NULL);
4244 new_vect_var = create_tmp_reg (type, tmp);
4245 free (tmp);
4247 else
4248 new_vect_var = create_tmp_reg (type, prefix);
4250 return new_vect_var;
4253 /* Like vect_get_new_vect_var but return an SSA name. */
4255 tree
4256 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4258 const char *prefix;
4259 tree new_vect_var;
4261 switch (var_kind)
4263 case vect_simple_var:
4264 prefix = "vect";
4265 break;
4266 case vect_scalar_var:
4267 prefix = "stmp";
4268 break;
4269 case vect_pointer_var:
4270 prefix = "vectp";
4271 break;
4272 default:
4273 gcc_unreachable ();
4276 if (name)
4278 char* tmp = concat (prefix, "_", name, NULL);
4279 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4280 free (tmp);
4282 else
4283 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4285 return new_vect_var;
4288 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4290 static void
4291 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4293 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4294 int misalign = DR_MISALIGNMENT (dr_info);
4295 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4296 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4297 else
4298 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4299 DR_TARGET_ALIGNMENT (dr_info), misalign);
4302 /* Function vect_create_addr_base_for_vector_ref.
4304 Create an expression that computes the address of the first memory location
4305 that will be accessed for a data reference.
4307 Input:
4308 STMT_INFO: The statement containing the data reference.
4309 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4310 OFFSET: Optional. If supplied, it is added to the initial address.
4311 LOOP: Specify relative to which loop-nest should the address be computed.
4312 For example, when the dataref is in an inner-loop nested in an
4313 outer-loop that is now being vectorized, LOOP can be either the
4314 outer-loop, or the inner-loop. The first memory location accessed
4315 by the following dataref ('in' points to short):
4317 for (i=0; i<N; i++)
4318 for (j=0; j<M; j++)
4319 s += in[i+j]
4321 is as follows:
4322 if LOOP=i_loop: &in (relative to i_loop)
4323 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4324 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4325 initial address. Unlike OFFSET, which is a number of elements to
4326 be added, BYTE_OFFSET is measured in bytes.
4328 Output:
4329 1. Return an SSA_NAME whose value is the address of the memory location of
4330 the first vector of the data reference.
4331 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4332 these statement(s) which define the returned SSA_NAME.
4334 FORNOW: We are only handling array accesses with step 1. */
4336 tree
4337 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4338 gimple_seq *new_stmt_list,
4339 tree offset,
4340 tree byte_offset)
4342 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4343 struct data_reference *dr = dr_info->dr;
4344 const char *base_name;
4345 tree addr_base;
4346 tree dest;
4347 gimple_seq seq = NULL;
4348 tree vect_ptr_type;
4349 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4350 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4351 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4353 tree data_ref_base = unshare_expr (drb->base_address);
4354 tree base_offset = unshare_expr (drb->offset);
4355 tree init = unshare_expr (drb->init);
4357 if (loop_vinfo)
4358 base_name = get_name (data_ref_base);
4359 else
4361 base_offset = ssize_int (0);
4362 init = ssize_int (0);
4363 base_name = get_name (DR_REF (dr));
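/* In the basic-block vectorization case there is no loop to compute the
   address relative to, so the analyzed offset and init are dropped and
   the address of the scalar reference itself is taken further below.  */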
4366 /* Create base_offset */
4367 base_offset = size_binop (PLUS_EXPR,
4368 fold_convert (sizetype, base_offset),
4369 fold_convert (sizetype, init));
4371 if (offset)
4373 offset = fold_build2 (MULT_EXPR, sizetype,
4374 fold_convert (sizetype, offset), step);
4375 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4376 base_offset, offset);
4378 if (byte_offset)
4380 byte_offset = fold_convert (sizetype, byte_offset);
4381 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4382 base_offset, byte_offset);
4385 /* base + base_offset */
4386 if (loop_vinfo)
4387 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4388 else
4390 addr_base = build1 (ADDR_EXPR,
4391 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4392 unshare_expr (DR_REF (dr)));
4395 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4396 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4397 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4398 gimple_seq_add_seq (new_stmt_list, seq);
4400 if (DR_PTR_INFO (dr)
4401 && TREE_CODE (addr_base) == SSA_NAME
4402 && !SSA_NAME_PTR_INFO (addr_base))
4404 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4405 if (offset || byte_offset)
4406 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4409 if (dump_enabled_p ())
4410 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4412 return addr_base;
4416 /* Function vect_create_data_ref_ptr.
4418 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4419 location accessed in the loop by STMT_INFO, along with the def-use update
4420 chain to appropriately advance the pointer through the loop iterations.
4421 Also set aliasing information for the pointer. This pointer is used by
4422 the callers to this function to create a memory reference expression for
4423 vector load/store access.
4425 Input:
4426 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4427 GIMPLE_ASSIGN <name, data-ref> or
4428 GIMPLE_ASSIGN <data-ref, name>.
4429 2. AGGR_TYPE: the type of the reference, which should be either a vector
4430 or an array.
4431 3. AT_LOOP: the loop where the vector memref is to be created.
4432 4. OFFSET (optional): an offset to be added to the initial address accessed
4433 by the data-ref in STMT_INFO.
4434 5. BSI: location where the new stmts are to be placed if there is no loop
4435 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4436 pointing to the initial address.
4437 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4438 to the initial address accessed by the data-ref in STMT_INFO. This is
4439 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4440 in bytes.
4441 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4442 to the IV during each iteration of the loop. NULL says to move
4443 by one copy of AGGR_TYPE up or down, depending on the step of the
4444 data reference.
4446 Output:
4447 1. Declare a new ptr to vector_type, and have it point to the base of the
4448 data reference (initial address accessed by the data reference).
4449 For example, for vector of type V8HI, the following code is generated:
4451 v8hi *ap;
4452 ap = (v8hi *)initial_address;
4454 if OFFSET is not supplied:
4455 initial_address = &a[init];
4456 if OFFSET is supplied:
4457 initial_address = &a[init + OFFSET];
4458 if BYTE_OFFSET is supplied:
4459 initial_address = &a[init] + BYTE_OFFSET;
4461 Return the initial_address in INITIAL_ADDRESS.
4463 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4464 update the pointer in each iteration of the loop.
4466 Return the increment stmt that updates the pointer in PTR_INCR.
4468 3. Return the pointer. */
4470 tree
4471 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4472 struct loop *at_loop, tree offset,
4473 tree *initial_address, gimple_stmt_iterator *gsi,
4474 gimple **ptr_incr, bool only_init,
4475 tree byte_offset, tree iv_step)
4477 const char *base_name;
4478 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4479 struct loop *loop = NULL;
4480 bool nested_in_vect_loop = false;
4481 struct loop *containing_loop = NULL;
4482 tree aggr_ptr_type;
4483 tree aggr_ptr;
4484 tree new_temp;
4485 gimple_seq new_stmt_list = NULL;
4486 edge pe = NULL;
4487 basic_block new_bb;
4488 tree aggr_ptr_init;
4489 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4490 struct data_reference *dr = dr_info->dr;
4491 tree aptr;
4492 gimple_stmt_iterator incr_gsi;
4493 bool insert_after;
4494 tree indx_before_incr, indx_after_incr;
4495 gimple *incr;
4496 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4498 gcc_assert (iv_step != NULL_TREE
4499 || TREE_CODE (aggr_type) == ARRAY_TYPE
4500 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4502 if (loop_vinfo)
4504 loop = LOOP_VINFO_LOOP (loop_vinfo);
4505 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4506 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4507 pe = loop_preheader_edge (loop);
4509 else
4511 gcc_assert (bb_vinfo);
4512 only_init = true;
4513 *ptr_incr = NULL;
4516 /* Create an expression for the first address accessed by this load
4517 in LOOP. */
4518 base_name = get_name (DR_BASE_ADDRESS (dr));
4520 if (dump_enabled_p ())
4522 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4523 dump_printf_loc (MSG_NOTE, vect_location,
4524 "create %s-pointer variable to type: %T",
4525 get_tree_code_name (TREE_CODE (aggr_type)),
4526 aggr_type);
4527 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4528 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4529 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4530 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4531 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4532 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4533 else
4534 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4535 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4538 /* (1) Create the new aggregate-pointer variable.
4539 Vector and array types inherit the alias set of their component
4540 type by default so we need to use a ref-all pointer if the data
4541 reference does not conflict with the created aggregated data
4542 reference because it is not addressable. */
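/* Illustrative case (an assumption, not part of the original comment):
   if the scalar accesses have an alias set that does not conflict with
   that of the vector type built here, a plain vector pointer would let
   alias analysis wrongly disambiguate the new aggregate accesses
   against the original scalar ones, hence the ref-all pointer.  */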
4543 bool need_ref_all = false;
4544 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4545 get_alias_set (DR_REF (dr))))
4546 need_ref_all = true;
4547 /* Likewise for any of the data references in the stmt group. */
4548 else if (DR_GROUP_SIZE (stmt_info) > 1)
4550 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4553 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4554 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4555 get_alias_set (DR_REF (sdr))))
4557 need_ref_all = true;
4558 break;
4560 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4562 while (sinfo);
4564 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4565 need_ref_all);
4566 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4569 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4570 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4571 def-use update cycles for the pointer: one relative to the outer-loop
4572 (LOOP), which is what steps (3) and (4) below do. The other is relative
4573 to the inner-loop (which is the inner-most loop containing the dataref),
4574 and this is done by step (5) below.
4576 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4577 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4578 redundant. Steps (3),(4) create the following:
4580 vp0 = &base_addr;
4581 LOOP: vp1 = phi(vp0,vp2)
4584 vp2 = vp1 + step
4585 goto LOOP
4587 If there is an inner-loop nested in loop, then step (5) will also be
4588 applied, and an additional update in the inner-loop will be created:
4590 vp0 = &base_addr;
4591 LOOP: vp1 = phi(vp0,vp2)
4593 inner: vp3 = phi(vp1,vp4)
4594 vp4 = vp3 + inner_step
4595 if () goto inner
4597 vp2 = vp1 + step
4598 if () goto LOOP */
4600 /* (2) Calculate the initial address of the aggregate-pointer, and set
4601 the aggregate-pointer to point to it before the loop. */
4603 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4605 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4606 offset, byte_offset);
4607 if (new_stmt_list)
4609 if (pe)
4611 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4612 gcc_assert (!new_bb);
4614 else
4615 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4618 *initial_address = new_temp;
4619 aggr_ptr_init = new_temp;
4621 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4622 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4623 inner-loop nested in LOOP (during outer-loop vectorization). */
4625 /* No update in loop is required. */
4626 if (only_init && (!loop_vinfo || at_loop == loop))
4627 aptr = aggr_ptr_init;
4628 else
4630 /* Accesses to invariant addresses should be handled specially
4631 by the caller. */
4632 tree step = vect_dr_behavior (dr_info)->step;
4633 gcc_assert (!integer_zerop (step));
4635 if (iv_step == NULL_TREE)
4637 /* The step of the aggregate pointer is the type size,
4638 negated for downward accesses. */
4639 iv_step = TYPE_SIZE_UNIT (aggr_type);
4640 if (tree_int_cst_sgn (step) == -1)
4641 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4644 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4646 create_iv (aggr_ptr_init,
4647 fold_convert (aggr_ptr_type, iv_step),
4648 aggr_ptr, loop, &incr_gsi, insert_after,
4649 &indx_before_incr, &indx_after_incr);
4650 incr = gsi_stmt (incr_gsi);
4651 loop_vinfo->add_stmt (incr);
4653 /* Copy the points-to information if it exists. */
4654 if (DR_PTR_INFO (dr))
4656 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4657 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4659 if (ptr_incr)
4660 *ptr_incr = incr;
4662 aptr = indx_before_incr;
4665 if (!nested_in_vect_loop || only_init)
4666 return aptr;
4669 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4670 nested in LOOP, if one exists.
4672 gcc_assert (nested_in_vect_loop);
4673 if (!only_init)
4675 standard_iv_increment_position (containing_loop, &incr_gsi,
4676 &insert_after);
4677 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4678 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4679 &indx_after_incr);
4680 incr = gsi_stmt (incr_gsi);
4681 loop_vinfo->add_stmt (incr);
4683 /* Copy the points-to information if it exists. */
4684 if (DR_PTR_INFO (dr))
4686 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4687 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4689 if (ptr_incr)
4690 *ptr_incr = incr;
4692 return indx_before_incr;
4694 else
4695 gcc_unreachable ();
4699 /* Function bump_vector_ptr
4701 Increment a pointer (to a vector type) by vector-size. If requested,
4702 i.e. if PTR-INCR is given, then also connect the new increment stmt
4703 to the existing def-use update-chain of the pointer, by modifying
4704 the PTR_INCR as illustrated below:
4706 The pointer def-use update-chain before this function:
4707 DATAREF_PTR = phi (p_0, p_2)
4708 ....
4709 PTR_INCR: p_2 = DATAREF_PTR + step
4711 The pointer def-use update-chain after this function:
4712 DATAREF_PTR = phi (p_0, p_2)
4713 ....
4714 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4715 ....
4716 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4718 Input:
4719 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4720 in the loop.
4721 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4722 the loop. The increment amount across iterations is expected
4723 to be vector_size.
4724 BSI - location where the new update stmt is to be placed.
4725 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4726 BUMP - optional. The offset by which to bump the pointer. If not given,
4727 the offset is assumed to be vector_size.
4729 Output: Return NEW_DATAREF_PTR as illustrated above.
4733 tree
4734 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4735 stmt_vec_info stmt_info, tree bump)
4737 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4738 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4739 tree update = TYPE_SIZE_UNIT (vectype);
4740 gassign *incr_stmt;
4741 ssa_op_iter iter;
4742 use_operand_p use_p;
4743 tree new_dataref_ptr;
4745 if (bump)
4746 update = bump;
4748 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4749 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4750 else
4751 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4752 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4753 dataref_ptr, update);
4754 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4756 /* Copy the points-to information if it exists. */
4757 if (DR_PTR_INFO (dr))
4759 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4760 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4763 if (!ptr_incr)
4764 return new_dataref_ptr;
4766 /* Update the vector-pointer's cross-iteration increment. */
4767 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4769 tree use = USE_FROM_PTR (use_p);
4771 if (use == dataref_ptr)
4772 SET_USE (use_p, new_dataref_ptr);
4773 else
4774 gcc_assert (operand_equal_p (use, update, 0));
4777 return new_dataref_ptr;
4781 /* Copy memory reference info such as base/clique from the SRC reference
4782 to the DEST MEM_REF. */
4784 void
4785 vect_copy_ref_info (tree dest, tree src)
4787 if (TREE_CODE (dest) != MEM_REF)
4788 return;
4790 tree src_base = src;
4791 while (handled_component_p (src_base))
4792 src_base = TREE_OPERAND (src_base, 0);
4793 if (TREE_CODE (src_base) != MEM_REF
4794 && TREE_CODE (src_base) != TARGET_MEM_REF)
4795 return;
4797 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
4798 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
4802 /* Function vect_create_destination_var.
4804 Create a new temporary of type VECTYPE. */
4806 tree
4807 vect_create_destination_var (tree scalar_dest, tree vectype)
4809 tree vec_dest;
4810 const char *name;
4811 char *new_name;
4812 tree type;
4813 enum vect_var_kind kind;
4815 kind = vectype
4816 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4817 ? vect_mask_var
4818 : vect_simple_var
4819 : vect_scalar_var;
4820 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4822 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4824 name = get_name (scalar_dest);
4825 if (name)
4826 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4827 else
4828 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4829 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4830 free (new_name);
4832 return vec_dest;
4835 /* Function vect_grouped_store_supported.
4837 Returns TRUE if interleave high and interleave low permutations
4838 are supported, and FALSE otherwise. */
4840 bool
4841 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4843 machine_mode mode = TYPE_MODE (vectype);
4845 /* vect_permute_store_chain requires the group size to be equal to 3 or
4846 be a power of two. */
4847 if (count != 3 && exact_log2 (count) == -1)
4849 if (dump_enabled_p ())
4850 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4851 "the size of the group of accesses"
4852 " is not a power of 2 or not eqaul to 3\n");
4853 return false;
4856 /* Check that the permutation is supported. */
4857 if (VECTOR_MODE_P (mode))
4859 unsigned int i;
4860 if (count == 3)
4862 unsigned int j0 = 0, j1 = 0, j2 = 0;
4863 unsigned int i, j;
4865 unsigned int nelt;
4866 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4868 if (dump_enabled_p ())
4869 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4870 "cannot handle groups of 3 stores for"
4871 " variable-length vectors\n");
4872 return false;
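/* For a group of three stores, two permutations are needed per output
   vector: one interleaving elements of the first two input vectors and
   one blending in elements of the third.  The masks built below mirror
   those generated by vect_permute_store_chain; each must be supported
   for all three output vectors.  */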
4875 vec_perm_builder sel (nelt, nelt, 1);
4876 sel.quick_grow (nelt);
4877 vec_perm_indices indices;
4878 for (j = 0; j < 3; j++)
4880 int nelt0 = ((3 - j) * nelt) % 3;
4881 int nelt1 = ((3 - j) * nelt + 1) % 3;
4882 int nelt2 = ((3 - j) * nelt + 2) % 3;
4883 for (i = 0; i < nelt; i++)
4885 if (3 * i + nelt0 < nelt)
4886 sel[3 * i + nelt0] = j0++;
4887 if (3 * i + nelt1 < nelt)
4888 sel[3 * i + nelt1] = nelt + j1++;
4889 if (3 * i + nelt2 < nelt)
4890 sel[3 * i + nelt2] = 0;
4892 indices.new_vector (sel, 2, nelt);
4893 if (!can_vec_perm_const_p (mode, indices))
4895 if (dump_enabled_p ())
4896 dump_printf (MSG_MISSED_OPTIMIZATION,
4897 "permutation op not supported by target.\n");
4898 return false;
4901 for (i = 0; i < nelt; i++)
4903 if (3 * i + nelt0 < nelt)
4904 sel[3 * i + nelt0] = 3 * i + nelt0;
4905 if (3 * i + nelt1 < nelt)
4906 sel[3 * i + nelt1] = 3 * i + nelt1;
4907 if (3 * i + nelt2 < nelt)
4908 sel[3 * i + nelt2] = nelt + j2++;
4910 indices.new_vector (sel, 2, nelt);
4911 if (!can_vec_perm_const_p (mode, indices))
4913 if (dump_enabled_p ())
4914 dump_printf (MSG_MISSED_OPTIMIZATION,
4915 "permutation op not supported by target.\n");
4916 return false;
4919 return true;
4921 else
4923 /* If length is not equal to 3 then only power of 2 is supported. */
4924 gcc_assert (pow2p_hwi (count));
4925 poly_uint64 nelt = GET_MODE_NUNITS (mode);
4927 /* The encoding has 2 interleaved stepped patterns. */
4928 vec_perm_builder sel (nelt, 2, 3);
4929 sel.quick_grow (6);
4930 for (i = 0; i < 3; i++)
4932 sel[i * 2] = i;
4933 sel[i * 2 + 1] = i + nelt;
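/* For example (illustrative), with nelt == 8 this encodes the
   interleave-high mask {0, 8, 1, 9, 2, 10, 3, 11}; adding nelt/2 to
   each element below yields the interleave-low mask
   {4, 12, 5, 13, 6, 14, 7, 15}.  */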
4935 vec_perm_indices indices (sel, 2, nelt);
4936 if (can_vec_perm_const_p (mode, indices))
4938 for (i = 0; i < 6; i++)
4939 sel[i] += exact_div (nelt, 2);
4940 indices.new_vector (sel, 2, nelt);
4941 if (can_vec_perm_const_p (mode, indices))
4942 return true;
4947 if (dump_enabled_p ())
4948 dump_printf (MSG_MISSED_OPTIMIZATION,
4949 "permutation op not supported by target.\n");
4950 return false;
4954 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
4955 type VECTYPE. MASKED_P says whether the masked form is needed. */
4957 bool
4958 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
4959 bool masked_p)
4961 if (masked_p)
4962 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
4963 vec_mask_store_lanes_optab,
4964 vectype, count);
4965 else
4966 return vect_lanes_optab_supported_p ("vec_store_lanes",
4967 vec_store_lanes_optab,
4968 vectype, count);
4972 /* Function vect_permute_store_chain.
4974 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4975 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4976 the data correctly for the stores. Return the final references for stores
4977 in RESULT_CHAIN.
4979 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4980 The input is 4 vectors each containing 8 elements. We assign a number to
4981 each element, the input sequence is:
4983 1st vec: 0 1 2 3 4 5 6 7
4984 2nd vec: 8 9 10 11 12 13 14 15
4985 3rd vec: 16 17 18 19 20 21 22 23
4986 4th vec: 24 25 26 27 28 29 30 31
4988 The output sequence should be:
4990 1st vec: 0 8 16 24 1 9 17 25
4991 2nd vec: 2 10 18 26 3 11 19 27
4992 3rd vec: 4 12 20 28 5 13 21 29
4993 4th vec: 6 14 22 30 7 15 23 31
4995 i.e., we interleave the contents of the four vectors in their order.
4997 We use interleave_high/low instructions to create such output. The input of
4998 each interleave_high/low operation is two vectors:
4999 1st vec 2nd vec
5000 0 1 2 3 4 5 6 7
5001 the even elements of the result vector are obtained left-to-right from the
5002 high/low elements of the first vector. The odd elements of the result are
5003 obtained left-to-right from the high/low elements of the second vector.
5004 The output of interleave_high will be: 0 4 1 5
5005 and of interleave_low: 2 6 3 7
5008 The permutation is done in log LENGTH stages. In each stage interleave_high
5009 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5010 where the first argument is taken from the first half of DR_CHAIN and the
5011 second argument from its second half.
5012 In our example,
5014 I1: interleave_high (1st vec, 3rd vec)
5015 I2: interleave_low (1st vec, 3rd vec)
5016 I3: interleave_high (2nd vec, 4th vec)
5017 I4: interleave_low (2nd vec, 4th vec)
5019 The output for the first stage is:
5021 I1: 0 16 1 17 2 18 3 19
5022 I2: 4 20 5 21 6 22 7 23
5023 I3: 8 24 9 25 10 26 11 27
5024 I4: 12 28 13 29 14 30 15 31
5026 The output of the second stage, i.e. the final result is:
5028 I1: 0 8 16 24 1 9 17 25
5029 I2: 2 10 18 26 3 11 19 27
5030 I3: 4 12 20 28 5 13 21 29
5031 I4: 6 14 22 30 7 15 23 31. */
5033 void
5034 vect_permute_store_chain (vec<tree> dr_chain,
5035 unsigned int length,
5036 stmt_vec_info stmt_info,
5037 gimple_stmt_iterator *gsi,
5038 vec<tree> *result_chain)
5040 tree vect1, vect2, high, low;
5041 gimple *perm_stmt;
5042 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5043 tree perm_mask_low, perm_mask_high;
5044 tree data_ref;
5045 tree perm3_mask_low, perm3_mask_high;
5046 unsigned int i, j, n, log_length = exact_log2 (length);
5048 result_chain->quick_grow (length);
5049 memcpy (result_chain->address (), dr_chain.address (),
5050 length * sizeof (tree));
5052 if (length == 3)
5054 /* vect_grouped_store_supported ensures that this is constant. */
5055 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5056 unsigned int j0 = 0, j1 = 0, j2 = 0;
5058 vec_perm_builder sel (nelt, nelt, 1);
5059 sel.quick_grow (nelt);
5060 vec_perm_indices indices;
5061 for (j = 0; j < 3; j++)
5063 int nelt0 = ((3 - j) * nelt) % 3;
5064 int nelt1 = ((3 - j) * nelt + 1) % 3;
5065 int nelt2 = ((3 - j) * nelt + 2) % 3;
5067 for (i = 0; i < nelt; i++)
5069 if (3 * i + nelt0 < nelt)
5070 sel[3 * i + nelt0] = j0++;
5071 if (3 * i + nelt1 < nelt)
5072 sel[3 * i + nelt1] = nelt + j1++;
5073 if (3 * i + nelt2 < nelt)
5074 sel[3 * i + nelt2] = 0;
5076 indices.new_vector (sel, 2, nelt);
5077 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5079 for (i = 0; i < nelt; i++)
5081 if (3 * i + nelt0 < nelt)
5082 sel[3 * i + nelt0] = 3 * i + nelt0;
5083 if (3 * i + nelt1 < nelt)
5084 sel[3 * i + nelt1] = 3 * i + nelt1;
5085 if (3 * i + nelt2 < nelt)
5086 sel[3 * i + nelt2] = nelt + j2++;
5088 indices.new_vector (sel, 2, nelt);
5089 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5091 vect1 = dr_chain[0];
5092 vect2 = dr_chain[1];
5094 /* Create interleaving stmt:
5095 low = VEC_PERM_EXPR <vect1, vect2,
5096 {j, nelt, *, j + 1, nelt + j + 1, *,
5097 j + 2, nelt + j + 2, *, ...}> */
5098 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5099 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5100 vect2, perm3_mask_low);
5101 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5103 vect1 = data_ref;
5104 vect2 = dr_chain[2];
5105 /* Create interleaving stmt:
5106 low = VEC_PERM_EXPR <vect1, vect2,
5107 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5108 6, 7, nelt + j + 2, ...}> */
5109 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5110 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5111 vect2, perm3_mask_high);
5112 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5113 (*result_chain)[j] = data_ref;
5116 else
5118 /* If length is not equal to 3 then only power of 2 is supported. */
5119 gcc_assert (pow2p_hwi (length));
5121 /* The encoding has 2 interleaved stepped patterns. */
5122 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5123 vec_perm_builder sel (nelt, 2, 3);
5124 sel.quick_grow (6);
5125 for (i = 0; i < 3; i++)
5127 sel[i * 2] = i;
5128 sel[i * 2 + 1] = i + nelt;
5130 vec_perm_indices indices (sel, 2, nelt);
5131 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5133 for (i = 0; i < 6; i++)
5134 sel[i] += exact_div (nelt, 2);
5135 indices.new_vector (sel, 2, nelt);
5136 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5138 for (i = 0, n = log_length; i < n; i++)
5140 for (j = 0; j < length/2; j++)
5142 vect1 = dr_chain[j];
5143 vect2 = dr_chain[j+length/2];
5145 /* Create interleaving stmt:
5146 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5147 ...}> */
5148 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5149 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5150 vect2, perm_mask_high);
5151 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5152 (*result_chain)[2*j] = high;
5154 /* Create interleaving stmt:
5155 low = VEC_PERM_EXPR <vect1, vect2,
5156 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5157 ...}> */
5158 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5159 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5160 vect2, perm_mask_low);
5161 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5162 (*result_chain)[2*j+1] = low;
5164 memcpy (dr_chain.address (), result_chain->address (),
5165 length * sizeof (tree));
5170 /* Function vect_setup_realignment
5172 This function is called when vectorizing an unaligned load using
5173 the dr_explicit_realign[_optimized] scheme.
5174 This function generates the following code at the loop prolog:
5176 p = initial_addr;
5177 x msq_init = *(floor(p)); # prolog load
5178 realignment_token = call target_builtin;
5179 loop:
5180 x msq = phi (msq_init, ---)
5182 The stmts marked with x are generated only for the case of
5183 dr_explicit_realign_optimized.
5185 The code above sets up a new (vector) pointer, pointing to the first
5186 location accessed by STMT_INFO, and a "floor-aligned" load using that
5187 pointer. It also generates code to compute the "realignment-token"
5188 (if the relevant target hook was defined), and creates a phi-node at the
5189 loop-header bb whose arguments are the result of the prolog-load (created
5190 by this function) and the result of a load that takes place in the loop
5191 (to be created by the caller to this function).
5193 For the case of dr_explicit_realign_optimized:
5194 The caller to this function uses the phi-result (msq) to create the
5195 realignment code inside the loop, and sets up the missing phi argument,
5196 as follows:
5197 loop:
5198 msq = phi (msq_init, lsq)
5199 lsq = *(floor(p')); # load in loop
5200 result = realign_load (msq, lsq, realignment_token);
5202 For the case of dr_explicit_realign:
5203 loop:
5204 msq = *(floor(p)); # load in loop
5205 p' = p + (VS-1);
5206 lsq = *(floor(p')); # load in loop
5207 result = realign_load (msq, lsq, realignment_token);
5209 Input:
5210 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5211 a memory location that may be unaligned.
5212 BSI - place where new code is to be inserted.
5213 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5214 is used.
5216 Output:
5217 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5218 target hook, if defined.
5219 Return value - the result of the loop-header phi node. */
5221 tree
5222 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5223 tree *realignment_token,
5224 enum dr_alignment_support alignment_support_scheme,
5225 tree init_addr,
5226 struct loop **at_loop)
5228 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5229 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5230 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5231 struct data_reference *dr = dr_info->dr;
5232 struct loop *loop = NULL;
5233 edge pe = NULL;
5234 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5235 tree vec_dest;
5236 gimple *inc;
5237 tree ptr;
5238 tree data_ref;
5239 basic_block new_bb;
5240 tree msq_init = NULL_TREE;
5241 tree new_temp;
5242 gphi *phi_stmt;
5243 tree msq = NULL_TREE;
5244 gimple_seq stmts = NULL;
5245 bool compute_in_loop = false;
5246 bool nested_in_vect_loop = false;
5247 struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5248 struct loop *loop_for_initial_load = NULL;
5250 if (loop_vinfo)
5252 loop = LOOP_VINFO_LOOP (loop_vinfo);
5253 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5256 gcc_assert (alignment_support_scheme == dr_explicit_realign
5257 || alignment_support_scheme == dr_explicit_realign_optimized);
5259 /* We need to generate three things:
5260 1. the misalignment computation
5261 2. the extra vector load (for the optimized realignment scheme).
5262 3. the phi node for the two vectors from which the realignment is
5263 done (for the optimized realignment scheme). */
5265 /* 1. Determine where to generate the misalignment computation.
5267 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5268 calculation will be generated by this function, outside the loop (in the
5269 preheader). Otherwise, INIT_ADDR has already been computed for us by the
5270 caller, inside the loop.
5272 Background: If the misalignment remains fixed throughout the iterations of
5273 the loop, then both realignment schemes are applicable, and also the
5274 misalignment computation can be done outside LOOP. This is because we are
5275 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5276 are a multiple of VS (the Vector Size), and therefore the misalignment in
5277 different vectorized LOOP iterations is always the same.
5278 The problem arises only if the memory access is in an inner-loop nested
5279 inside LOOP, which is now being vectorized using outer-loop vectorization.
5280 This is the only case when the misalignment of the memory access may not
5281 remain fixed throughout the iterations of the inner-loop (as explained in
5282 detail in vect_supportable_dr_alignment). In this case, not only is the
5283 optimized realignment scheme not applicable, but also the misalignment
5284 computation (and generation of the realignment token that is passed to
5285 REALIGN_LOAD) has to be done inside the loop.
5287 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5288 or not, which in turn determines if the misalignment is computed inside
5289 the inner-loop, or outside LOOP. */
5291 if (init_addr != NULL_TREE || !loop_vinfo)
5293 compute_in_loop = true;
5294 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5298 /* 2. Determine where to generate the extra vector load.
5300 For the optimized realignment scheme, instead of generating two vector
5301 loads in each iteration, we generate a single extra vector load in the
5302 preheader of the loop, and in each iteration reuse the result of the
5303 vector load from the previous iteration. In case the memory access is in
5304 an inner-loop nested inside LOOP, which is now being vectorized using
5305 outer-loop vectorization, we need to determine whether this initial vector
5306 load should be generated at the preheader of the inner-loop, or can be
5307 generated at the preheader of LOOP. If the memory access has no evolution
5308 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5309 to be generated inside LOOP (in the preheader of the inner-loop). */
5311 if (nested_in_vect_loop)
5313 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5314 bool invariant_in_outerloop =
5315 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5316 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5318 else
5319 loop_for_initial_load = loop;
5320 if (at_loop)
5321 *at_loop = loop_for_initial_load;
5323 if (loop_for_initial_load)
5324 pe = loop_preheader_edge (loop_for_initial_load);
5326 /* 3. For the case of the optimized realignment, create the first vector
5327 load at the loop preheader. */
5329 if (alignment_support_scheme == dr_explicit_realign_optimized)
5331 /* Create msq_init = *(floor(p1)) in the loop preheader */
5332 gassign *new_stmt;
5334 gcc_assert (!compute_in_loop);
5335 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5336 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5337 loop_for_initial_load, NULL_TREE,
5338 &init_addr, NULL, &inc, true);
5339 if (TREE_CODE (ptr) == SSA_NAME)
5340 new_temp = copy_ssa_name (ptr);
5341 else
5342 new_temp = make_ssa_name (TREE_TYPE (ptr));
5343 unsigned int align = DR_TARGET_ALIGNMENT (dr_info);
5344 new_stmt = gimple_build_assign
5345 (new_temp, BIT_AND_EXPR, ptr,
5346 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5347 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5348 gcc_assert (!new_bb);
5349 data_ref
5350 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5351 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5352 vect_copy_ref_info (data_ref, DR_REF (dr));
5353 new_stmt = gimple_build_assign (vec_dest, data_ref);
5354 new_temp = make_ssa_name (vec_dest, new_stmt);
5355 gimple_assign_set_lhs (new_stmt, new_temp);
5356 if (pe)
5358 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5359 gcc_assert (!new_bb);
5361 else
5362 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5364 msq_init = gimple_assign_lhs (new_stmt);
5367 /* 4. Create realignment token using a target builtin, if available.
5368 It is done either inside the containing loop, or before LOOP (as
5369 determined above). */
5371 if (targetm.vectorize.builtin_mask_for_load)
5373 gcall *new_stmt;
5374 tree builtin_decl;
5376 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5377 if (!init_addr)
5379 /* Generate the INIT_ADDR computation outside LOOP. */
5380 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5381 NULL_TREE);
5382 if (loop)
5384 pe = loop_preheader_edge (loop);
5385 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5386 gcc_assert (!new_bb);
5388 else
5389 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5392 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5393 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5394 vec_dest =
5395 vect_create_destination_var (scalar_dest,
5396 gimple_call_return_type (new_stmt));
5397 new_temp = make_ssa_name (vec_dest, new_stmt);
5398 gimple_call_set_lhs (new_stmt, new_temp);
5400 if (compute_in_loop)
5401 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5402 else
5404 /* Generate the misalignment computation outside LOOP. */
5405 pe = loop_preheader_edge (loop);
5406 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5407 gcc_assert (!new_bb);
5410 *realignment_token = gimple_call_lhs (new_stmt);
5412 /* The result of the CALL_EXPR to this builtin is determined from
5413 the value of the parameter, and no global variables are touched,
5414 which makes the builtin a "const" function. Requiring the
5415 builtin to have the "const" attribute makes it unnecessary
5416 to call mark_call_clobbered. */
5417 gcc_assert (TREE_READONLY (builtin_decl));
5420 if (alignment_support_scheme == dr_explicit_realign)
5421 return msq;
5423 gcc_assert (!compute_in_loop);
5424 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5427 /* 5. Create msq = phi <msq_init, lsq> in loop */
5429 pe = loop_preheader_edge (containing_loop);
5430 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5431 msq = make_ssa_name (vec_dest);
5432 phi_stmt = create_phi_node (msq, containing_loop->header);
5433 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5435 return msq;
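/* Illustrative sketch, not part of GCC: a plain-C model of what the
   dr_explicit_realign scheme computes for a single unaligned load of
   VS_MODEL ints.  Two loads from "floor"-aligned addresses are combined
   using the element misalignment, which plays the role of the realignment
   token.  All names (VS_MODEL, model_realign_load) are invented for the
   example, and the underlying buffer is assumed to extend far enough for
   the second aligned load.  */
#if 0  /* Illustration only; never compiled.  */
#include <stdint.h>

#define VS_MODEL 4

static void
model_realign_load (const int *p, int out[VS_MODEL])
{
  /* floor (p): round the address down to a vector-sized boundary.  */
  uintptr_t addr = (uintptr_t) p;
  const int *msq
    = (const int *) (addr & ~(uintptr_t) (VS_MODEL * sizeof (int) - 1));
  /* Element misalignment of P; this is what the realignment token encodes.  */
  unsigned int mis = (unsigned int) ((addr / sizeof (int)) % VS_MODEL);
  /* lsq = *(floor (p + VS - 1)): when P is misaligned this is the next
     aligned chunk; it is never read when MIS == 0.  */
  const int *lsq = msq + VS_MODEL;

  /* result = realign_load (msq, lsq, token): tail of MSQ, then head of LSQ.  */
  for (unsigned int i = 0; i < VS_MODEL; i++)
    out[i] = (mis + i < VS_MODEL) ? msq[mis + i] : lsq[mis + i - VS_MODEL];
}
#endif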
5439 /* Function vect_grouped_load_supported.
5441 COUNT is the size of the load group (the number of statements plus the
5442 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5443 only one statement, with a gap of COUNT - 1.
5445 Returns true if a suitable permute exists. */
5447 bool
5448 vect_grouped_load_supported (tree vectype, bool single_element_p,
5449 unsigned HOST_WIDE_INT count)
5451 machine_mode mode = TYPE_MODE (vectype);
5453 /* If this is single-element interleaving with an element distance
5454 that leaves unused vector loads around, punt - we would at least create
5455 very sub-optimal code in that case (and blow up memory,
5456 see PR65518). */
5457 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5459 if (dump_enabled_p ())
5460 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5461 "single-element interleaving not supported "
5462 "for not adjacent vector loads\n");
5463 return false;
5466 /* vect_permute_load_chain requires the group size to be equal to 3 or
5467 be a power of two. */
5468 if (count != 3 && exact_log2 (count) == -1)
5470 if (dump_enabled_p ())
5471 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5472 "the size of the group of accesses"
5473 " is not a power of 2 or not equal to 3\n");
5474 return false;
5477 /* Check that the permutation is supported. */
5478 if (VECTOR_MODE_P (mode))
5480 unsigned int i, j;
5481 if (count == 3)
5483 unsigned int nelt;
5484 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5486 if (dump_enabled_p ())
5487 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5488 "cannot handle groups of 3 loads for"
5489 " variable-length vectors\n");
5490 return false;
5493 vec_perm_builder sel (nelt, nelt, 1);
5494 sel.quick_grow (nelt);
5495 vec_perm_indices indices;
5496 unsigned int k;
5497 for (k = 0; k < 3; k++)
5499 for (i = 0; i < nelt; i++)
5500 if (3 * i + k < 2 * nelt)
5501 sel[i] = 3 * i + k;
5502 else
5503 sel[i] = 0;
5504 indices.new_vector (sel, 2, nelt);
5505 if (!can_vec_perm_const_p (mode, indices))
5507 if (dump_enabled_p ())
5508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5509 "shuffle of 3 loads is not supported by"
5510 " target\n");
5511 return false;
5513 for (i = 0, j = 0; i < nelt; i++)
5514 if (3 * i + k < 2 * nelt)
5515 sel[i] = i;
5516 else
5517 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5518 indices.new_vector (sel, 2, nelt);
5519 if (!can_vec_perm_const_p (mode, indices))
5521 if (dump_enabled_p ())
5522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5523 "shuffle of 3 loads is not supported by"
5524 " target\n");
5525 return false;
5528 return true;
5530 else
5532 /* If length is not equal to 3 then only power of 2 is supported. */
5533 gcc_assert (pow2p_hwi (count));
5534 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5536 /* The encoding has a single stepped pattern. */
5537 vec_perm_builder sel (nelt, 1, 3);
5538 sel.quick_grow (3);
5539 for (i = 0; i < 3; i++)
5540 sel[i] = i * 2;
5541 vec_perm_indices indices (sel, 2, nelt);
5542 if (can_vec_perm_const_p (mode, indices))
5544 for (i = 0; i < 3; i++)
5545 sel[i] = i * 2 + 1;
5546 indices.new_vector (sel, 2, nelt);
5547 if (can_vec_perm_const_p (mode, indices))
5548 return true;
5553 if (dump_enabled_p ())
5554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5555 "extract even/odd not supported by target\n");
5556 return false;
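/* Illustrative sketch, not part of GCC: the two selectors checked above for
   COUNT == 3, written out in plain C.  Applying the first permute to
   (v0, v1) and then the second permute to (that result, v2) extracts
   elements K, K+3, K+6, ... of the three-vector group.  For NELT == 8 and
   K == 0 the selectors are {0,3,6,9,12,15,0,0} and {0,1,2,3,4,5,10,13}.
   The function name is invented for the example.  */
#if 0  /* Illustration only; never compiled.  */
static void
model_load3_selectors (unsigned int nelt, unsigned int k,
                       unsigned int *sel1, unsigned int *sel2)
{
  /* First permute: gather the in-range elements 3*i + k from the first two
     vectors; out-of-range slots are "don't care" (0 here).  */
  for (unsigned int i = 0; i < nelt; i++)
    sel1[i] = (3 * i + k < 2 * nelt) ? 3 * i + k : 0;

  /* Second permute: keep what was already gathered and pull the remaining
     elements of the sequence from the third vector.  */
  for (unsigned int i = 0, j = 0; i < nelt; i++)
    sel2[i] = (3 * i + k < 2 * nelt) ? i : nelt + ((nelt + k) % 3) + 3 * j++;
}
#endif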
5559 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5560 type VECTYPE. MASKED_P says whether the masked form is needed. */
5562 bool
5563 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5564 bool masked_p)
5566 if (masked_p)
5567 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5568 vec_mask_load_lanes_optab,
5569 vectype, count);
5570 else
5571 return vect_lanes_optab_supported_p ("vec_load_lanes",
5572 vec_load_lanes_optab,
5573 vectype, count);
5576 /* Function vect_permute_load_chain.
5578 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5579 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5580 the input data correctly. Return the final references for loads in
5581 RESULT_CHAIN.
5583 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5584 The input is 4 vectors each containing 8 elements. We assign a number to each
5585 element, the input sequence is:
5587 1st vec: 0 1 2 3 4 5 6 7
5588 2nd vec: 8 9 10 11 12 13 14 15
5589 3rd vec: 16 17 18 19 20 21 22 23
5590 4th vec: 24 25 26 27 28 29 30 31
5592 The output sequence should be:
5594 1st vec: 0 4 8 12 16 20 24 28
5595 2nd vec: 1 5 9 13 17 21 25 29
5596 3rd vec: 2 6 10 14 18 22 26 30
5597 4th vec: 3 7 11 15 19 23 27 31
5599 i.e., the first output vector should contain the first elements of each
5600 interleaving group, etc.
5602 We use extract_even/odd instructions to create such output. The input of
5603 each extract_even/odd operation is two vectors
5604 1st vec 2nd vec
5605 0 1 2 3 4 5 6 7
5607 and the output is the vector of extracted even/odd elements. The output of
5608 extract_even will be: 0 2 4 6
5609 and of extract_odd: 1 3 5 7
5612 The permutation is done in log LENGTH stages. In each stage extract_even
5613 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5614 their order. In our example,
5616 E1: extract_even (1st vec, 2nd vec)
5617 E2: extract_odd (1st vec, 2nd vec)
5618 E3: extract_even (3rd vec, 4th vec)
5619 E4: extract_odd (3rd vec, 4th vec)
5621 The output for the first stage will be:
5623 E1: 0 2 4 6 8 10 12 14
5624 E2: 1 3 5 7 9 11 13 15
5625 E3: 16 18 20 22 24 26 28 30
5626 E4: 17 19 21 23 25 27 29 31
5628 In order to proceed and create the correct sequence for the next stage (or
5629 for the correct output, if the second stage is the last one, as in our
5630 example), we first put the output of the extract_even operation and then
5631 the output of extract_odd into RESULT_CHAIN (which is then copied to DR_CHAIN).
5632 The input for the second stage is:
5634 1st vec (E1): 0 2 4 6 8 10 12 14
5635 2nd vec (E3): 16 18 20 22 24 26 28 30
5636 3rd vec (E2): 1 3 5 7 9 11 13 15
5637 4th vec (E4): 17 19 21 23 25 27 29 31
5639 The output of the second stage:
5641 E1: 0 4 8 12 16 20 24 28
5642 E2: 2 6 10 14 18 22 26 30
5643 E3: 1 5 9 13 17 21 25 29
5644 E4: 3 7 11 15 19 23 27 31
5646 And RESULT_CHAIN after reordering:
5648 1st vec (E1): 0 4 8 12 16 20 24 28
5649 2nd vec (E3): 1 5 9 13 17 21 25 29
5650 3rd vec (E2): 2 6 10 14 18 22 26 30
5651 4th vec (E4): 3 7 11 15 19 23 27 31. */
5653 static void
5654 vect_permute_load_chain (vec<tree> dr_chain,
5655 unsigned int length,
5656 stmt_vec_info stmt_info,
5657 gimple_stmt_iterator *gsi,
5658 vec<tree> *result_chain)
5660 tree data_ref, first_vect, second_vect;
5661 tree perm_mask_even, perm_mask_odd;
5662 tree perm3_mask_low, perm3_mask_high;
5663 gimple *perm_stmt;
5664 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5665 unsigned int i, j, log_length = exact_log2 (length);
5667 result_chain->quick_grow (length);
5668 memcpy (result_chain->address (), dr_chain.address (),
5669 length * sizeof (tree));
5671 if (length == 3)
5673 /* vect_grouped_load_supported ensures that this is constant. */
5674 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5675 unsigned int k;
5677 vec_perm_builder sel (nelt, nelt, 1);
5678 sel.quick_grow (nelt);
5679 vec_perm_indices indices;
5680 for (k = 0; k < 3; k++)
5682 for (i = 0; i < nelt; i++)
5683 if (3 * i + k < 2 * nelt)
5684 sel[i] = 3 * i + k;
5685 else
5686 sel[i] = 0;
5687 indices.new_vector (sel, 2, nelt);
5688 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5690 for (i = 0, j = 0; i < nelt; i++)
5691 if (3 * i + k < 2 * nelt)
5692 sel[i] = i;
5693 else
5694 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5695 indices.new_vector (sel, 2, nelt);
5696 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5698 first_vect = dr_chain[0];
5699 second_vect = dr_chain[1];
5701 /* Create interleaving stmt (low part of):
5702 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5703 ...}> */
5704 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5705 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5706 second_vect, perm3_mask_low);
5707 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5709 /* Create interleaving stmt (high part of):
5710 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5711 ...}> */
5712 first_vect = data_ref;
5713 second_vect = dr_chain[2];
5714 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5715 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5716 second_vect, perm3_mask_high);
5717 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5718 (*result_chain)[k] = data_ref;
5721 else
5723 /* If length is not equal to 3 then only power of 2 is supported. */
5724 gcc_assert (pow2p_hwi (length));
5726 /* The encoding has a single stepped pattern. */
5727 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5728 vec_perm_builder sel (nelt, 1, 3);
5729 sel.quick_grow (3);
5730 for (i = 0; i < 3; ++i)
5731 sel[i] = i * 2;
5732 vec_perm_indices indices (sel, 2, nelt);
5733 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5735 for (i = 0; i < 3; ++i)
5736 sel[i] = i * 2 + 1;
5737 indices.new_vector (sel, 2, nelt);
5738 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5740 for (i = 0; i < log_length; i++)
5742 for (j = 0; j < length; j += 2)
5744 first_vect = dr_chain[j];
5745 second_vect = dr_chain[j+1];
5747 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5748 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5749 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5750 first_vect, second_vect,
5751 perm_mask_even);
5752 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5753 (*result_chain)[j/2] = data_ref;
5755 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5756 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5757 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5758 first_vect, second_vect,
5759 perm_mask_odd);
5760 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5761 (*result_chain)[j/2+length/2] = data_ref;
5763 memcpy (dr_chain.address (), result_chain->address (),
5764 length * sizeof (tree));
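/* Illustrative sketch, not part of GCC: the power-of-two branch above,
   modelled on plain integer arrays for LENGTH == 4 and NELT == 8, so the
   worked example in the comment before vect_permute_load_chain can be
   reproduced directly.  Starting from four input vectors whose elements are
   numbered 0..31, two passes of extract_even/extract_odd leave
   chain[0..3] = {0,4,8,...}, {1,5,9,...}, {2,6,10,...}, {3,7,11,...}.
   The function name is invented for the example.  */
#if 0  /* Illustration only; never compiled.  */
static void
model_permute_load_chain (int chain[4][8])
{
  const unsigned int length = 4, nelt = 8, log_length = 2;
  int result[4][8];

  for (unsigned int stage = 0; stage < log_length; stage++)
    {
      for (unsigned int j = 0; j < length; j += 2)
        for (unsigned int i = 0; i < nelt; i++)
          {
            /* extract_even (chain[j], chain[j+1]) -> result[j/2]  */
            result[j / 2][i] = (i < nelt / 2
                                ? chain[j][2 * i]
                                : chain[j + 1][2 * (i - nelt / 2)]);
            /* extract_odd (chain[j], chain[j+1]) -> result[j/2 + length/2]  */
            result[j / 2 + length / 2][i]
              = (i < nelt / 2
                 ? chain[j][2 * i + 1]
                 : chain[j + 1][2 * (i - nelt / 2) + 1]);
          }
      /* Feed this stage's output into the next stage, as the memcpy above
         does with DR_CHAIN.  */
      for (unsigned int j = 0; j < length; j++)
        for (unsigned int i = 0; i < nelt; i++)
          chain[j][i] = result[j][i];
    }
}
#endif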
5769 /* Function vect_shift_permute_load_chain.
5771 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5772 sequence of stmts to reorder the input data accordingly.
5773 Return the final references for loads in RESULT_CHAIN.
5774 Return true if successful, false otherwise.
5776 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5777 The input is 3 vectors each containing 8 elements. We assign a
5778 number to each element, the input sequence is:
5780 1st vec: 0 1 2 3 4 5 6 7
5781 2nd vec: 8 9 10 11 12 13 14 15
5782 3rd vec: 16 17 18 19 20 21 22 23
5784 The output sequence should be:
5786 1st vec: 0 3 6 9 12 15 18 21
5787 2nd vec: 1 4 7 10 13 16 19 22
5788 3rd vec: 2 5 8 11 14 17 20 23
5790 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5792 First we shuffle all 3 vectors to get correct elements order:
5794 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5795 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5796 3rd vec: (16 19 22) (17 20 23) (18 21)
5798 Next we unite and shift vector 3 times:
5800 1st step:
5801 shift right by 6 the concatenation of:
5802 "1st vec" and "2nd vec"
5803 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5804 "2nd vec" and "3rd vec"
5805 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5806 "3rd vec" and "1st vec"
5807 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5808 | New vectors |
5810 So that now new vectors are:
5812 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5813 2nd vec: (10 13) (16 19 22) (17 20 23)
5814 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5816 2nd step:
5817 shift right by 5 the concatenation of:
5818 "1st vec" and "3rd vec"
5819 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5820 "2nd vec" and "1st vec"
5821 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5822 "3rd vec" and "2nd vec"
5823 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5824 | New vectors |
5826 So that now new vectors are:
5828 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5829 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5830 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5832 3rd step:
5833 shift right by 5 the concatenation of:
5834 "1st vec" and "1st vec"
5835 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5836 shift right by 3 the concatenation of:
5837 "2nd vec" and "2nd vec"
5838 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5839 | New vectors |
5841 So that now all vectors are READY:
5842 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5843 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5844 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5846 This algorithm is faster than the one in vect_permute_load_chain if:
5847 1. "shift of a concatination" is faster than general permutation.
5848 This is usually so.
5849 2. The TARGET machine can't execute vector instructions in parallel.
5850 This is because each step of the algorithm depends on the previous one.
5851 The algorithm in vect_permute_load_chain is much more parallel.
5853 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5856 static bool
5857 vect_shift_permute_load_chain (vec<tree> dr_chain,
5858 unsigned int length,
5859 stmt_vec_info stmt_info,
5860 gimple_stmt_iterator *gsi,
5861 vec<tree> *result_chain)
5863 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5864 tree perm2_mask1, perm2_mask2, perm3_mask;
5865 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5866 gimple *perm_stmt;
5868 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5869 unsigned int i;
5870 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5872 unsigned HOST_WIDE_INT nelt, vf;
5873 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
5874 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
5875 /* Not supported for variable-length vectors. */
5876 return false;
5878 vec_perm_builder sel (nelt, nelt, 1);
5879 sel.quick_grow (nelt);
5881 result_chain->quick_grow (length);
5882 memcpy (result_chain->address (), dr_chain.address (),
5883 length * sizeof (tree));
5885 if (pow2p_hwi (length) && vf > 4)
5887 unsigned int j, log_length = exact_log2 (length);
5888 for (i = 0; i < nelt / 2; ++i)
5889 sel[i] = i * 2;
5890 for (i = 0; i < nelt / 2; ++i)
5891 sel[nelt / 2 + i] = i * 2 + 1;
5892 vec_perm_indices indices (sel, 2, nelt);
5893 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5895 if (dump_enabled_p ())
5896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5897 "shuffle of 2 fields structure is not \
5898 supported by target\n");
5899 return false;
5901 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
5903 for (i = 0; i < nelt / 2; ++i)
5904 sel[i] = i * 2 + 1;
5905 for (i = 0; i < nelt / 2; ++i)
5906 sel[nelt / 2 + i] = i * 2;
5907 indices.new_vector (sel, 2, nelt);
5908 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5910 if (dump_enabled_p ())
5911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5912 "shuffle of 2 fields structure is not \
5913 supported by target\n");
5914 return false;
5916 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
5918 /* Generating permutation constant to shift all elements.
5919 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5920 for (i = 0; i < nelt; i++)
5921 sel[i] = nelt / 2 + i;
5922 indices.new_vector (sel, 2, nelt);
5923 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5925 if (dump_enabled_p ())
5926 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5927 "shift permutation is not supported by target\n");
5928 return false;
5930 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5932 /* Generating permutation constant to select vector from 2.
5933 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5934 for (i = 0; i < nelt / 2; i++)
5935 sel[i] = i;
5936 for (i = nelt / 2; i < nelt; i++)
5937 sel[i] = nelt + i;
5938 indices.new_vector (sel, 2, nelt);
5939 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5941 if (dump_enabled_p ())
5942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5943 "select is not supported by target\n");
5944 return false;
5946 select_mask = vect_gen_perm_mask_checked (vectype, indices);
5948 for (i = 0; i < log_length; i++)
5950 for (j = 0; j < length; j += 2)
5952 first_vect = dr_chain[j];
5953 second_vect = dr_chain[j + 1];
5955 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5956 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5957 first_vect, first_vect,
5958 perm2_mask1);
5959 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5960 vect[0] = data_ref;
5962 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5963 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5964 second_vect, second_vect,
5965 perm2_mask2);
5966 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5967 vect[1] = data_ref;
5969 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5970 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5971 vect[0], vect[1], shift1_mask);
5972 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5973 (*result_chain)[j/2 + length/2] = data_ref;
5975 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5976 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5977 vect[0], vect[1], select_mask);
5978 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5979 (*result_chain)[j/2] = data_ref;
5981 memcpy (dr_chain.address (), result_chain->address (),
5982 length * sizeof (tree));
5984 return true;
5986 if (length == 3 && vf > 2)
5988 unsigned int k = 0, l = 0;
5990 /* Generating permutation constant to get all elements in right order.
5991 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5992 for (i = 0; i < nelt; i++)
5994 if (3 * k + (l % 3) >= nelt)
5996 k = 0;
5997 l += (3 - (nelt % 3));
5999 sel[i] = 3 * k + (l % 3);
6000 k++;
6002 vec_perm_indices indices (sel, 2, nelt);
6003 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6005 if (dump_enabled_p ())
6006 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6007 "shuffle of 3 fields structure is not \
6008 supported by target\n");
6009 return false;
6011 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6013 /* Generating permutation constant to shift all elements.
6014 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6015 for (i = 0; i < nelt; i++)
6016 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6017 indices.new_vector (sel, 2, nelt);
6018 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6020 if (dump_enabled_p ())
6021 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6022 "shift permutation is not supported by target\n");
6023 return false;
6025 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6027 /* Generating permutation constant to shift all elements.
6028 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6029 for (i = 0; i < nelt; i++)
6030 sel[i] = 2 * (nelt / 3) + 1 + i;
6031 indices.new_vector (sel, 2, nelt);
6032 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6034 if (dump_enabled_p ())
6035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6036 "shift permutation is not supported by target\n");
6037 return false;
6039 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6041 /* Generating permutation constant to shift all elements.
6042 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6043 for (i = 0; i < nelt; i++)
6044 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6045 indices.new_vector (sel, 2, nelt);
6046 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6048 if (dump_enabled_p ())
6049 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6050 "shift permutation is not supported by target\n");
6051 return false;
6053 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6055 /* Generating permutation constant to shift all elements.
6056 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6057 for (i = 0; i < nelt; i++)
6058 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6059 indices.new_vector (sel, 2, nelt);
6060 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6062 if (dump_enabled_p ())
6063 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6064 "shift permutation is not supported by target\n");
6065 return false;
6067 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6069 for (k = 0; k < 3; k++)
6071 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6072 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6073 dr_chain[k], dr_chain[k],
6074 perm3_mask);
6075 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6076 vect[k] = data_ref;
6079 for (k = 0; k < 3; k++)
6081 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6082 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6083 vect[k % 3], vect[(k + 1) % 3],
6084 shift1_mask);
6085 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6086 vect_shift[k] = data_ref;
6089 for (k = 0; k < 3; k++)
6091 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6092 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6093 vect_shift[(4 - k) % 3],
6094 vect_shift[(3 - k) % 3],
6095 shift2_mask);
6096 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6097 vect[k] = data_ref;
6100 (*result_chain)[3 - (nelt % 3)] = vect[2];
6102 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6103 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6104 vect[0], shift3_mask);
6105 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6106 (*result_chain)[nelt % 3] = data_ref;
6108 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6109 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6110 vect[1], shift4_mask);
6111 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6112 (*result_chain)[0] = data_ref;
6113 return true;
6115 return false;
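/* Illustrative sketch, not part of GCC: the selector-building loop of the
   LENGTH == 3 case above, extracted into plain C.  It builds the "get all
   elements in right order" shuffle; for NELT == 8 it yields
   {0, 3, 6, 1, 4, 7, 2, 5}, matching the comment in the code.  The function
   name is invented for the example.  */
#if 0  /* Illustration only; never compiled.  */
static void
model_perm3_selector (unsigned int nelt, unsigned int *sel)
{
  unsigned int k = 0, l = 0;

  for (unsigned int i = 0; i < nelt; i++)
    {
      /* Wrap around to the next "column" once 3*k + (l % 3) runs past the
         vector length.  */
      if (3 * k + (l % 3) >= nelt)
        {
          k = 0;
          l += 3 - (nelt % 3);
        }
      sel[i] = 3 * k + (l % 3);
      k++;
    }
}
#endif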
6118 /* Function vect_transform_grouped_load.
6120 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6121 to perform their permutation and ascribe the resulting vectorized statements to
6122 the scalar statements.
6125 void
6126 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6127 int size, gimple_stmt_iterator *gsi)
6129 machine_mode mode;
6130 vec<tree> result_chain = vNULL;
6132 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6133 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6134 vectors that are ready for vector computation. */
6135 result_chain.create (size);
6137 /* If the reassociation width for the vector type is 2 or greater, the target
6138 machine can execute 2 or more vector instructions in parallel. Otherwise try
6139 to get the chain for the load group using vect_shift_permute_load_chain. */
6140 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6141 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6142 || pow2p_hwi (size)
6143 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6144 gsi, &result_chain))
6145 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6146 vect_record_grouped_load_vectors (stmt_info, result_chain);
6147 result_chain.release ();
6150 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6151 generated as part of the vectorization of STMT_INFO. Assign the statement
6152 for each vector to the associated scalar statement. */
6154 void
6155 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6156 vec<tree> result_chain)
6158 vec_info *vinfo = stmt_info->vinfo;
6159 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6160 unsigned int i, gap_count;
6161 tree tmp_data_ref;
6163 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6164 Since we scan the chain starting from its first node, their order
6165 corresponds to the order of data-refs in RESULT_CHAIN. */
6166 stmt_vec_info next_stmt_info = first_stmt_info;
6167 gap_count = 1;
6168 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6170 if (!next_stmt_info)
6171 break;
6173 /* Skip the gaps. Loads created for the gaps will be removed by the dead
6174 code elimination pass later. No need to check for the first stmt in
6175 the group, since it always exists.
6176 DR_GROUP_GAP is the number of steps in elements from the previous
6177 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6178 correspond to the gaps. */
6179 if (next_stmt_info != first_stmt_info
6180 && gap_count < DR_GROUP_GAP (next_stmt_info))
6182 gap_count++;
6183 continue;
6186 while (next_stmt_info)
6188 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6189 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6190 copies, and we put the new vector statement in the first available
6191 RELATED_STMT. */
6192 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6193 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6194 else
6196 if (!DR_GROUP_SAME_DR_STMT (next_stmt_info))
6198 stmt_vec_info prev_stmt_info
6199 = STMT_VINFO_VEC_STMT (next_stmt_info);
6200 stmt_vec_info rel_stmt_info
6201 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6202 while (rel_stmt_info)
6204 prev_stmt_info = rel_stmt_info;
6205 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6208 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6212 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6213 gap_count = 1;
6214 /* If NEXT_STMT_INFO accesses the same DR as the previous statement,
6215 put the same TMP_DATA_REF as its vectorized statement; otherwise
6216 get the next data-ref from RESULT_CHAIN. */
6217 if (!next_stmt_info || !DR_GROUP_SAME_DR_STMT (next_stmt_info))
6218 break;
6223 /* Function vect_can_force_dr_alignment_p.
6225 Returns whether the alignment of a DECL can be forced to be aligned
6226 on ALIGNMENT bit boundary. */
6228 bool
6229 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6231 if (!VAR_P (decl))
6232 return false;
6234 if (decl_in_symtab_p (decl)
6235 && !symtab_node::get (decl)->can_increase_alignment_p ())
6236 return false;
6238 if (TREE_STATIC (decl))
6239 return (alignment <= MAX_OFILE_ALIGNMENT);
6240 else
6241 return (alignment <= MAX_STACK_ALIGNMENT);
6245 /* Return whether the data reference DR_INFO is supported with respect to its
6246 alignment.
6247 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6248 if it is aligned, i.e., check if it is possible to vectorize it with different
6249 alignment. */
6251 enum dr_alignment_support
6252 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6253 bool check_aligned_accesses)
6255 data_reference *dr = dr_info->dr;
6256 stmt_vec_info stmt_info = dr_info->stmt;
6257 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6258 machine_mode mode = TYPE_MODE (vectype);
6259 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6260 struct loop *vect_loop = NULL;
6261 bool nested_in_vect_loop = false;
6263 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6264 return dr_aligned;
6266 /* For now assume all conditional loads/stores support unaligned
6267 access without any special code. */
6268 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6269 if (gimple_call_internal_p (stmt)
6270 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6271 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6272 return dr_unaligned_supported;
6274 if (loop_vinfo)
6276 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6277 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6280 /* Possibly unaligned access. */
6282 /* We can choose between using the implicit realignment scheme (generating
6283 a misaligned_move stmt) and the explicit realignment scheme (generating
6284 aligned loads with a REALIGN_LOAD). There are two variants to the
6285 explicit realignment scheme: optimized, and unoptimized.
6286 We can optimize the realignment only if the step between consecutive
6287 vector loads is equal to the vector size. Since the vector memory
6288 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6289 is guaranteed that the misalignment amount remains the same throughout the
6290 execution of the vectorized loop. Therefore, we can create the
6291 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6292 at the loop preheader.
6294 However, in the case of outer-loop vectorization, when vectorizing a
6295 memory access in the inner-loop nested within the LOOP that is now being
6296 vectorized, while it is guaranteed that the misalignment of the
6297 vectorized memory access will remain the same in different outer-loop
6298 iterations, it is *not* guaranteed that it will remain the same throughout
6299 the execution of the inner-loop. This is because the inner-loop advances
6300 with the original scalar step (and not in steps of VS). If the inner-loop
6301 step happens to be a multiple of VS, then the misalignment remains fixed
6302 and we can use the optimized realignment scheme. For example:
6304 for (i=0; i<N; i++)
6305 for (j=0; j<M; j++)
6306 s += a[i+j];
6308 When vectorizing the i-loop in the above example, the step between
6309 consecutive vector loads is 1, and so the misalignment does not remain
6310 fixed across the execution of the inner-loop, and the realignment cannot
6311 be optimized (as illustrated in the following pseudo vectorized loop):
6313 for (i=0; i<N; i+=4)
6314 for (j=0; j<M; j++){
6315 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6316 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6317 // (assuming that we start from an aligned address).
6320 We therefore have to use the unoptimized realignment scheme:
6322 for (i=0; i<N; i+=4)
6323 for (j=k; j<M; j+=4)
6324 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6325 // that the misalignment of the initial address is
6326 // 0).
6328 The loop can then be vectorized as follows:
6330 for (k=0; k<4; k++){
6331 rt = get_realignment_token (&vp[k]);
6332 for (i=0; i<N; i+=4){
6333 v1 = vp[i+k];
6334 for (j=k; j<M; j+=4){
6335 v2 = vp[i+j+VS-1];
6336 va = REALIGN_LOAD <v1,v2,rt>;
6337 vs += va;
6338 v1 = v2;
6341 } */
6343 if (DR_IS_READ (dr))
6345 bool is_packed = false;
6346 tree type = (TREE_TYPE (DR_REF (dr)));
6348 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6349 && (!targetm.vectorize.builtin_mask_for_load
6350 || targetm.vectorize.builtin_mask_for_load ()))
6352 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6354 /* If we are doing SLP then the accesses need not have the
6355 same alignment; instead it depends on the SLP group size. */
6356 if (loop_vinfo
6357 && STMT_SLP_TYPE (stmt_info)
6358 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6359 * (DR_GROUP_SIZE
6360 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6361 TYPE_VECTOR_SUBPARTS (vectype)))
6363 else if (!loop_vinfo
6364 || (nested_in_vect_loop
6365 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6366 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6367 return dr_explicit_realign;
6368 else
6369 return dr_explicit_realign_optimized;
6371 if (!known_alignment_for_access_p (dr_info))
6372 is_packed = not_size_aligned (DR_REF (dr));
6374 if (targetm.vectorize.support_vector_misalignment
6375 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6376 /* Can't software pipeline the loads, but can at least do them. */
6377 return dr_unaligned_supported;
6379 else
6381 bool is_packed = false;
6382 tree type = (TREE_TYPE (DR_REF (dr)));
6384 if (!known_alignment_for_access_p (dr_info))
6385 is_packed = not_size_aligned (DR_REF (dr));
6387 if (targetm.vectorize.support_vector_misalignment
6388 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6389 return dr_unaligned_supported;
6392 /* Unsupported. */
6393 return dr_unaligned_unsupported;
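/* Illustrative sketch, not part of GCC: the misalignment pattern behind the
   outer-loop discussion above.  Assuming the array base is aligned to a
   VS-element boundary, the element misalignment of &a[i + j] is
   (i + j) % VS.  The vectorized outer i-loop advances by VS, so for a fixed
   j the misalignment is the same in every outer iteration, but as j advances
   by the scalar step inside the inner loop it cycles 0, 1, 2, ..., which is
   why only the unoptimized explicit realignment scheme applies there.  The
   function name is invented for the example.  */
#if 0  /* Illustration only; never compiled.  */
static unsigned int
model_element_misalignment (unsigned int i, unsigned int j, unsigned int vs)
{
  /* Element misalignment of &a[i + j] for a VS-element-aligned base.  */
  return (i + j) % vs;
}
#endif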