gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
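/* Illustrative note (not part of the original sources): on targets with
   structure load/store instructions such as AArch64 LD2/LD3/LD4 and
   ST2/ST3/ST4, a group of COUNT interleaved V4SI accesses corresponds to
   an array mode of COUNT V4SI vectors, and this predicate asks whether a
   vec_load_lanes / vec_store_lanes handler exists for that array mode.  */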
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
102 /* Return the smallest scalar part of STMT_INFO.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
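/* Illustrative example (an assumption for exposition, not taken from the
   original sources):

     short *s; int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i];

   The result type is 'int', but the smallest scalar type in the loop is
   'short', so the vectorization factor must be derived from the 2-byte
   type.  */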
119 tree
120 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
121 HOST_WIDE_INT *lhs_size_unit,
122 HOST_WIDE_INT *rhs_size_unit)
124 tree scalar_type = gimple_expr_type (stmt_info->stmt);
125 HOST_WIDE_INT lhs, rhs;
127 /* During the analysis phase, this function is called on arbitrary
128 statements that might not have scalar results. */
129 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
130 return scalar_type;
132 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
134 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
135 if (assign
136 && (gimple_assign_cast_p (assign)
137 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
140 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
141 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
143 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
145 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
146 if (rhs < lhs)
147 scalar_type = rhs_type;
150 *lhs_size_unit = lhs;
151 *rhs_size_unit = rhs;
152 return scalar_type;
156 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
157 tested at run-time. Return TRUE if DDR was successfully inserted.
158 Return false if versioning is not supported. */
160 static bool
161 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
163 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
165 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
166 return false;
168 if (!runtime_alias_check_p (ddr, loop,
169 optimize_loop_nest_for_speed_p (loop)))
170 return false;
172 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
173 return true;
176 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
178 static void
179 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
181 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
182 for (unsigned int i = 0; i < checks.length(); ++i)
183 if (checks[i] == value)
184 return;
186 if (dump_enabled_p ())
188 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
189 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
190 dump_printf (MSG_NOTE, " is nonzero\n");
192 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
195 /* Return true if we know that the order of vectorized DR_INFO_A and
196 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
197 DR_INFO_B. At least one of the accesses is a write. */
199 static bool
200 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
202 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
203 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
205 /* Single statements are always kept in their original order. */
206 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
207 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
208 return true;
210 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
211 group are emitted at the position of the first scalar load and all
212 stores in a group are emitted at the position of the last scalar store.
213 Thus writes will happen no earlier than their current position
214 (but could happen later) while reads will happen no later than their
215 current position (but could happen earlier). Reordering is therefore
216 only possible if the first access is a write. */
217 if (is_pattern_stmt_p (stmtinfo_a))
218 stmtinfo_a = STMT_VINFO_RELATED_STMT (stmtinfo_a);
219 if (is_pattern_stmt_p (stmtinfo_b))
220 stmtinfo_b = STMT_VINFO_RELATED_STMT (stmtinfo_b);
221 stmt_vec_info earlier_stmt_info = get_earlier_stmt (stmtinfo_a, stmtinfo_b);
222 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (earlier_stmt_info));
225 /* A subroutine of vect_analyze_data_ref_dependence. Handle
226 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
227 distances. These distances are conservatively correct but they don't
228 reflect a guaranteed dependence.
230 Return true if this function does all the work necessary to avoid
231 an alias or false if the caller should use the dependence distances
232 to limit the vectorization factor in the usual way. LOOP_DEPTH is
233 the depth of the loop described by LOOP_VINFO and the other arguments
234 are as for vect_analyze_data_ref_dependence. */
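/* Illustrative example (an assumption for exposition): in

     void f (int *a, int *b, int n)
     {
       for (int i = 0; i < n; i++)
         a[i] = b[i] + 1;
     }

   the dependence distances computed under the assumption that A and B
   overlap are conservatively correct, but the two bases may well be
   independent; in that case a runtime alias check is usually preferable
   to limiting the vectorization factor.  */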
236 static bool
237 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
238 loop_vec_info loop_vinfo,
239 int loop_depth, unsigned int *max_vf)
241 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
242 lambda_vector dist_v;
243 unsigned int i;
244 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
246 int dist = dist_v[loop_depth];
247 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
249 /* If the user asserted safelen >= DIST consecutive iterations
250 can be executed concurrently, assume independence.
252 ??? An alternative would be to add the alias check even
253 in this case, and vectorize the fallback loop with the
254 maximum VF set to safelen. However, if the user has
255 explicitly given a length, it's less likely that that
256 would be a win. */
257 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
259 if ((unsigned int) loop->safelen < *max_vf)
260 *max_vf = loop->safelen;
261 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
262 continue;
265 /* For dependence distances of 2 or more, we have the option
266 of limiting VF or checking for an alias at runtime.
267 Prefer to check at runtime if we can, to avoid limiting
268 the VF unnecessarily when the bases are in fact independent.
270 Note that the alias checks will be removed if the VF ends up
271 being small enough. */
272 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
273 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
274 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
275 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
276 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
279 return true;
283 /* Function vect_analyze_data_ref_dependence.
285 Return TRUE if there (might) exist a dependence between a memory-reference
286 DRA and a memory-reference DRB. When versioning for alias may check a
287 dependence at run-time, return FALSE. Adjust *MAX_VF according to
288 the data dependence. */
290 static bool
291 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
292 loop_vec_info loop_vinfo,
293 unsigned int *max_vf)
295 unsigned int i;
296 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
297 struct data_reference *dra = DDR_A (ddr);
298 struct data_reference *drb = DDR_B (ddr);
299 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
300 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
301 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
302 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
303 lambda_vector dist_v;
304 unsigned int loop_depth;
306 /* In loop analysis all data references should be vectorizable. */
307 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
308 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
309 gcc_unreachable ();
311 /* Independent data accesses. */
312 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
313 return false;
315 if (dra == drb
316 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
317 return false;
319 /* We do not have to consider dependences between accesses that belong
320 to the same group, unless the stride could be smaller than the
321 group size. */
322 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
323 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
324 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
325 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
326 return false;
328 /* Even if we have an anti-dependence then, as the vectorized loop covers at
329 least two scalar iterations, there is always also a true dependence.
330 As the vectorizer does not re-order loads and stores we can ignore
331 the anti-dependence if TBAA can disambiguate both DRs similar to the
332 case with known negative distance anti-dependences (positive
333 distance anti-dependences would violate TBAA constraints). */
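	/* Illustrative case (an assumption for exposition): with

	     int *p; float *q;
	     ... = *p;
	     *q = ...;

	   the alias sets of 'int' and 'float' do not conflict, so the
	   anti-dependence between the read and the write can be ignored.  */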
334 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
335 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
336 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
337 get_alias_set (DR_REF (drb))))
338 return false;
340 /* Unknown data dependence. */
341 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
343 /* If user asserted safelen consecutive iterations can be
344 executed concurrently, assume independence. */
345 if (loop->safelen >= 2)
347 if ((unsigned int) loop->safelen < *max_vf)
348 *max_vf = loop->safelen;
349 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
350 return false;
353 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
354 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
356 if (dump_enabled_p ())
358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
359 "versioning for alias not supported for: "
360 "can't determine dependence between ");
361 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
362 DR_REF (dra));
363 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
364 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
365 DR_REF (drb));
366 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
368 return true;
371 if (dump_enabled_p ())
373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
374 "versioning for alias required: "
375 "can't determine dependence between ");
376 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
377 DR_REF (dra));
378 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
379 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
380 DR_REF (drb));
381 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
384 /* Add to list of ddrs that need to be tested at run-time. */
385 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
388 /* Known data dependence. */
389 if (DDR_NUM_DIST_VECTS (ddr) == 0)
391 /* If user asserted safelen consecutive iterations can be
392 executed concurrently, assume independence. */
393 if (loop->safelen >= 2)
395 if ((unsigned int) loop->safelen < *max_vf)
396 *max_vf = loop->safelen;
397 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
398 return false;
401 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
402 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
404 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "versioning for alias not supported for: "
408 "bad dist vector for ");
409 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
410 DR_REF (dra));
411 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
412 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
413 DR_REF (drb));
414 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
416 return true;
419 if (dump_enabled_p ())
421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
422 "versioning for alias required: "
423 "bad dist vector for ");
424 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
425 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
426 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
427 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
429 /* Add to list of ddrs that need to be tested at run-time. */
430 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
433 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
435 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
436 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
437 loop_depth, max_vf))
438 return false;
440 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
442 int dist = dist_v[loop_depth];
444 if (dump_enabled_p ())
445 dump_printf_loc (MSG_NOTE, vect_location,
446 "dependence distance = %d.\n", dist);
448 if (dist == 0)
450 if (dump_enabled_p ())
452 dump_printf_loc (MSG_NOTE, vect_location,
453 "dependence distance == 0 between ");
454 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
455 dump_printf (MSG_NOTE, " and ");
456 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
457 dump_printf (MSG_NOTE, "\n");
460 /* When we perform grouped accesses and perform implicit CSE
461 by detecting equal accesses and doing disambiguation with
462 runtime alias tests like for
463 .. = a[i];
464 .. = a[i+1];
465 a[i] = ..;
466 a[i+1] = ..;
467 *p = ..;
468 .. = a[i];
469 .. = a[i+1];
470 where we will end up loading { a[i], a[i+1] } once, make
471 sure that inserting group loads before the first load and
472 stores after the last store will do the right thing.
473 Similar for groups like
474 a[i] = ...;
475 ... = a[i];
476 a[i+1] = ...;
477 where loads from the group interleave with the store. */
478 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
480 if (dump_enabled_p ())
481 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
482 "READ_WRITE dependence in interleaving.\n");
483 return true;
486 if (loop->safelen < 2)
488 tree indicator = dr_zero_step_indicator (dra);
489 if (!indicator || integer_zerop (indicator))
491 if (dump_enabled_p ())
492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
493 "access also has a zero step\n");
494 return true;
496 else if (TREE_CODE (indicator) != INTEGER_CST)
497 vect_check_nonzero_value (loop_vinfo, indicator);
499 continue;
502 if (dist > 0 && DDR_REVERSED_P (ddr))
504 /* If DDR_REVERSED_P the order of the data-refs in DDR was
505 reversed (to make distance vector positive), and the actual
506 distance is negative. */
507 if (dump_enabled_p ())
508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
509 "dependence distance negative.\n");
510 /* Record a negative dependence distance to later limit the
511 amount of stmt copying / unrolling we can perform.
512 Only need to handle read-after-write dependence. */
513 if (DR_IS_READ (drb)
514 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
515 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
516 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
517 continue;
520 unsigned int abs_dist = abs (dist);
521 if (abs_dist >= 2 && abs_dist < *max_vf)
523 /* The dependence distance requires reduction of the maximal
524 vectorization factor. */
525 *max_vf = abs (dist);
526 if (dump_enabled_p ())
527 dump_printf_loc (MSG_NOTE, vect_location,
528 "adjusting maximal vectorization factor to %i\n",
529 *max_vf);
532 if (abs_dist >= *max_vf)
534 /* Dependence distance does not create dependence, as far as
535 vectorization is concerned, in this case. */
536 if (dump_enabled_p ())
537 dump_printf_loc (MSG_NOTE, vect_location,
538 "dependence distance >= VF.\n");
539 continue;
542 if (dump_enabled_p ())
544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
545 "not vectorized, possible dependence "
546 "between data-refs ");
547 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
548 dump_printf (MSG_NOTE, " and ");
549 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
550 dump_printf (MSG_NOTE, "\n");
553 return true;
556 return false;
559 /* Function vect_analyze_data_ref_dependences.
561 Examine all the data references in the loop, and make sure there do not
562 exist any data dependences between them. Set *MAX_VF according to
563 the maximum vectorization factor the data dependences allow. */
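/* Illustrative example (not from the original sources): in

     for (i = 0; i < n; i++)
       a[i + 2] = a[i] + 1;

   the read-after-write dependence has distance 2, so *MAX_VF is limited
   to 2 and the loop may still be vectorized with at most two elements
   per vector iteration.  */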
565 bool
566 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
567 unsigned int *max_vf)
569 unsigned int i;
570 struct data_dependence_relation *ddr;
572 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
574 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
576 LOOP_VINFO_DDRS (loop_vinfo)
577 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
578 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
579 /* We need read-read dependences to compute
580 STMT_VINFO_SAME_ALIGN_REFS. */
581 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
582 &LOOP_VINFO_DDRS (loop_vinfo),
583 LOOP_VINFO_LOOP_NEST (loop_vinfo),
584 true);
585 gcc_assert (res);
588 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
590 /* For epilogues we either have no aliases or alias versioning
591 was applied to original loop. Therefore we may just get max_vf
592 using VF of original loop. */
593 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
594 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
595 else
596 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
597 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
598 return false;
600 return true;
604 /* Function vect_slp_analyze_data_ref_dependence.
606 Return TRUE if there (might) exist a dependence between a memory-reference
607 DRA and a memory-reference DRB for VINFO. */
611 static bool
612 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
613 struct data_dependence_relation *ddr)
615 struct data_reference *dra = DDR_A (ddr);
616 struct data_reference *drb = DDR_B (ddr);
617 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
618 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
620 /* We need to check dependences of statements marked as unvectorizable
621 as well, they still can prohibit vectorization. */
623 /* Independent data accesses. */
624 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
625 return false;
627 if (dra == drb)
628 return false;
630 /* Read-read is OK. */
631 if (DR_IS_READ (dra) && DR_IS_READ (drb))
632 return false;
634 /* If dra and drb are part of the same interleaving chain consider
635 them independent. */
636 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
637 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
638 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
639 return false;
641 /* Unknown data dependence. */
642 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
644 if (dump_enabled_p ())
646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
647 "can't determine dependence between ");
648 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
649 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
650 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
651 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
654 else if (dump_enabled_p ())
656 dump_printf_loc (MSG_NOTE, vect_location,
657 "determined dependence between ");
658 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
659 dump_printf (MSG_NOTE, " and ");
660 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
661 dump_printf (MSG_NOTE, "\n");
664 return true;
668 /* Analyze dependences involved in the transform of SLP NODE. STORES
669 contain the vector of scalar stores of this instance if we are
670 disambiguating the loads. */
672 static bool
673 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
674 vec<stmt_vec_info> stores,
675 stmt_vec_info last_store_info)
677 /* This walks over all stmts involved in the SLP load/store done
678 in NODE verifying we can sink them up to the last stmt in the
679 group. */
680 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
681 vec_info *vinfo = last_access_info->vinfo;
682 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
684 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
685 if (access_info == last_access_info)
686 continue;
687 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
688 ao_ref ref;
689 bool ref_initialized_p = false;
690 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
691 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
693 gimple *stmt = gsi_stmt (gsi);
694 if (! gimple_vuse (stmt)
695 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
696 continue;
698 /* If we couldn't record a (single) data reference for this
699 stmt we have to resort to the alias oracle. */
700 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
701 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
702 if (!dr_b)
704 /* We are moving a store or sinking a load - this means
705 we cannot use TBAA for disambiguation. */
706 if (!ref_initialized_p)
707 ao_ref_init (&ref, DR_REF (dr_a));
708 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
709 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
710 return false;
711 continue;
714 bool dependent = false;
715 /* If we run into a store of this same instance (we've just
716 marked those) then delay dependence checking until we run
717 into the last store because this is where it will have
718 been sunk to (and we verify if we can do that as well). */
719 if (gimple_visited_p (stmt))
721 if (stmt_info != last_store_info)
722 continue;
723 unsigned i;
724 stmt_vec_info store_info;
725 FOR_EACH_VEC_ELT (stores, i, store_info)
727 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
728 ddr_p ddr = initialize_data_dependence_relation
729 (dr_a, store_dr, vNULL);
730 dependent
731 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
732 free_dependence_relation (ddr);
733 if (dependent)
734 break;
737 else
739 ddr_p ddr = initialize_data_dependence_relation (dr_a,
740 dr_b, vNULL);
741 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
742 free_dependence_relation (ddr);
744 if (dependent)
745 return false;
748 return true;
752 /* Function vect_slp_analyze_instance_dependence.
754 Examine all the data references in the SLP instance, and make sure there
755 do not exist any data dependences between them. */
758 bool
759 vect_slp_analyze_instance_dependence (slp_instance instance)
761 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
763 /* The stores of this instance are at the root of the SLP tree. */
764 slp_tree store = SLP_INSTANCE_TREE (instance);
765 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
766 store = NULL;
768 /* Verify we can sink stores to the vectorized stmt insert location. */
769 stmt_vec_info last_store_info = NULL;
770 if (store)
772 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
773 return false;
775 /* Mark stores in this instance and remember the last one. */
776 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
777 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
778 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
781 bool res = true;
783 /* Verify we can sink loads to the vectorized stmt insert location,
784 special-casing stores of this instance. */
785 slp_tree load;
786 unsigned int i;
787 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
788 if (! vect_slp_analyze_node_dependences (instance, load,
789 store
790 ? SLP_TREE_SCALAR_STMTS (store)
791 : vNULL, last_store_info))
793 res = false;
794 break;
797 /* Unset the visited flag. */
798 if (store)
799 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
800 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
802 return res;
805 /* Record the base alignment guarantee given by DRB, which occurs
806 in STMT_INFO. */
808 static void
809 vect_record_base_alignment (stmt_vec_info stmt_info,
810 innermost_loop_behavior *drb)
812 vec_info *vinfo = stmt_info->vinfo;
813 bool existed;
814 innermost_loop_behavior *&entry
815 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
816 if (!existed || entry->base_alignment < drb->base_alignment)
818 entry = drb;
819 if (dump_enabled_p ())
821 dump_printf_loc (MSG_NOTE, vect_location,
822 "recording new base alignment for ");
823 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
824 dump_printf (MSG_NOTE, "\n");
825 dump_printf_loc (MSG_NOTE, vect_location,
826 " alignment: %d\n", drb->base_alignment);
827 dump_printf_loc (MSG_NOTE, vect_location,
828 " misalignment: %d\n", drb->base_misalignment);
829 dump_printf_loc (MSG_NOTE, vect_location,
830 " based on: ");
831 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
836 /* If the region we're going to vectorize is reached, all unconditional
837 data references occur at least once. We can therefore pool the base
838 alignment guarantees from each unconditional reference. Do this by
839 going through all the data references in VINFO and checking whether
840 the containing statement makes the reference unconditionally. If so,
841 record the alignment of the base address in VINFO so that it can be
842 used for all other references with the same base. */
844 void
845 vect_record_base_alignments (vec_info *vinfo)
847 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
848 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
849 data_reference *dr;
850 unsigned int i;
851 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
853 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
854 stmt_vec_info stmt_info = dr_info->stmt;
855 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
856 && STMT_VINFO_VECTORIZABLE (stmt_info)
857 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
859 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
861 /* If DR is nested in the loop that is being vectorized, we can also
862 record the alignment of the base wrt the outer loop. */
863 if (loop && nested_in_vect_loop_p (loop, stmt_info))
864 vect_record_base_alignment
865 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
870 /* Return the target alignment for the vectorized form of DR_INFO. */
872 static unsigned int
873 vect_calculate_target_alignment (dr_vec_info *dr_info)
875 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
876 return targetm.vectorize.preferred_vector_alignment (vectype);
879 /* Function vect_compute_data_ref_alignment
881 Compute the misalignment of the data reference DR_INFO.
883 Output:
884 1. DR_MISALIGNMENT (DR_INFO) is defined.
886 FOR NOW: No analysis is actually performed. Misalignment is calculated
887 only for trivial cases. TODO. */
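/* Worked example (illustrative, not from the original sources): with a
   preferred vector alignment of 16 bytes, a 16-byte aligned base and an
   access 'a[i + 1]' on 4-byte elements, the base misalignment is 0 and
   DR_INIT is 4, so the recorded misalignment is (0 + 4) % 16 = 4 bytes.  */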
889 static void
890 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
892 stmt_vec_info stmt_info = dr_info->stmt;
893 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
894 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
895 struct loop *loop = NULL;
896 tree ref = DR_REF (dr_info->dr);
897 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
899 if (dump_enabled_p ())
900 dump_printf_loc (MSG_NOTE, vect_location,
901 "vect_compute_data_ref_alignment:\n");
903 if (loop_vinfo)
904 loop = LOOP_VINFO_LOOP (loop_vinfo);
906 /* Initialize misalignment to unknown. */
907 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
909 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
910 return;
912 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
913 bool step_preserves_misalignment_p;
915 unsigned HOST_WIDE_INT vector_alignment
916 = vect_calculate_target_alignment (dr_info) / BITS_PER_UNIT;
917 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
919 /* No step for BB vectorization. */
920 if (!loop)
922 gcc_assert (integer_zerop (drb->step));
923 step_preserves_misalignment_p = true;
926 /* In case the dataref is in an inner-loop of the loop that is being
927 vectorized (LOOP), we use the base and misalignment information
928 relative to the outer-loop (LOOP). This is ok only if the misalignment
929 stays the same throughout the execution of the inner-loop, which is why
930 we have to check that the stride of the dataref in the inner-loop is a
931 multiple of the vector alignment. */
932 else if (nested_in_vect_loop_p (loop, stmt_info))
934 step_preserves_misalignment_p
935 = (DR_STEP_ALIGNMENT (dr_info->dr) % vector_alignment) == 0;
937 if (dump_enabled_p ())
939 if (step_preserves_misalignment_p)
940 dump_printf_loc (MSG_NOTE, vect_location,
941 "inner step divides the vector alignment.\n");
942 else
943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
944 "inner step doesn't divide the vector"
945 " alignment.\n");
949 /* Similarly we can only use base and misalignment information relative to
950 an innermost loop if the misalignment stays the same throughout the
951 execution of the loop. As above, this is the case if the stride of
952 the dataref is a multiple of the alignment. */
953 else
955 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
956 step_preserves_misalignment_p
957 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vector_alignment);
959 if (!step_preserves_misalignment_p && dump_enabled_p ())
960 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
961 "step doesn't divide the vector alignment.\n");
964 unsigned int base_alignment = drb->base_alignment;
965 unsigned int base_misalignment = drb->base_misalignment;
967 /* Calculate the maximum of the pooled base address alignment and the
968 alignment that we can compute for DR itself. */
969 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
970 if (entry && base_alignment < (*entry)->base_alignment)
972 base_alignment = (*entry)->base_alignment;
973 base_misalignment = (*entry)->base_misalignment;
976 if (drb->offset_alignment < vector_alignment
977 || !step_preserves_misalignment_p
978 /* We need to know whether the step wrt the vectorized loop is
979 negative when computing the starting misalignment below. */
980 || TREE_CODE (drb->step) != INTEGER_CST)
982 if (dump_enabled_p ())
984 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
985 "Unknown alignment for access: ");
986 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
987 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
989 return;
992 if (base_alignment < vector_alignment)
994 unsigned int max_alignment;
995 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
996 if (max_alignment < vector_alignment
997 || !vect_can_force_dr_alignment_p (base,
998 vector_alignment * BITS_PER_UNIT))
1000 if (dump_enabled_p ())
1002 dump_printf_loc (MSG_NOTE, vect_location,
1003 "can't force alignment of ref: ");
1004 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1005 dump_printf (MSG_NOTE, "\n");
1007 return;
1010 /* Force the alignment of the decl.
1011 NOTE: This is the only change to the code we make during
1012 the analysis phase, before deciding to vectorize the loop. */
1013 if (dump_enabled_p ())
1015 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1016 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1017 dump_printf (MSG_NOTE, "\n");
1020 dr_info->base_decl = base;
1021 dr_info->base_misaligned = true;
1022 base_misalignment = 0;
1024 poly_int64 misalignment
1025 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1027 /* If this is a backward running DR then first access in the larger
1028 vectype actually is N-1 elements before the address in the DR.
1029 Adjust misalign accordingly. */
1030 if (tree_int_cst_sgn (drb->step) < 0)
1031 /* PLUS because STEP is negative. */
1032 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1033 * TREE_INT_CST_LOW (drb->step));
1035 unsigned int const_misalignment;
1036 if (!known_misalignment (misalignment, vector_alignment,
1037 &const_misalignment))
1039 if (dump_enabled_p ())
1041 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1042 "Non-constant misalignment for access: ");
1043 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1044 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1046 return;
1049 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1051 if (dump_enabled_p ())
1053 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1054 "misalign = %d bytes of ref ",
1055 DR_MISALIGNMENT (dr_info));
1056 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1057 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1060 return;
1063 /* Function vect_update_misalignment_for_peel.
1064 Sets DR_INFO's misalignment
1065 - to 0 if it has the same alignment as DR_PEEL_INFO,
1066 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1067 - to -1 (unknown) otherwise.
1069 DR_INFO - the data reference whose misalignment is to be adjusted.
1070 DR_PEEL_INFO - the data reference whose misalignment is being made
1071 zero in the vector loop by the peel.
1072 NPEEL - the number of iterations in the peel loop if the misalignment
1073 of DR_PEEL_INFO is known at compile time. */
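/* Worked example (illustrative): with a 16-byte target alignment, 4-byte
   elements, DR_MISALIGNMENT (DR_INFO) == 12 and NPEEL == 3 for a forward
   (positive step) access, the updated misalignment is
   (12 + 3 * 4) & 15 == 8.  */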
1075 static void
1076 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1077 dr_vec_info *dr_peel_info, int npeel)
1079 unsigned int i;
1080 vec<dr_p> same_aligned_drs;
1081 struct data_reference *current_dr;
1082 int dr_size = vect_get_scalar_dr_size (dr_info);
1083 int dr_peel_size = vect_get_scalar_dr_size (dr_peel_info);
1084 stmt_vec_info stmt_info = dr_info->stmt;
1085 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1087 /* For interleaved data accesses the step in the loop must be multiplied by
1088 the size of the interleaving group. */
1089 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1090 dr_size *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
1091 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1092 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1094 /* It can be assumed that the data refs with the same alignment as dr_peel
1095 are aligned in the vector loop. */
1096 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1097 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1099 if (current_dr != dr_info->dr)
1100 continue;
1101 gcc_assert (!known_alignment_for_access_p (dr_info)
1102 || !known_alignment_for_access_p (dr_peel_info)
1103 || (DR_MISALIGNMENT (dr_info) / dr_size
1104 == DR_MISALIGNMENT (dr_peel_info) / dr_peel_size));
1105 SET_DR_MISALIGNMENT (dr_info, 0);
1106 return;
1109 if (known_alignment_for_access_p (dr_info)
1110 && known_alignment_for_access_p (dr_peel_info))
1112 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1113 size_zero_node) < 0;
1114 int misal = DR_MISALIGNMENT (dr_info);
1115 misal += negative ? -npeel * dr_size : npeel * dr_size;
1116 misal &= DR_TARGET_ALIGNMENT (dr_info) - 1;
1117 SET_DR_MISALIGNMENT (dr_info, misal);
1118 return;
1121 if (dump_enabled_p ())
1122 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1123 "to unknown (-1).\n");
1124 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1128 /* Function verify_data_ref_alignment
1130 Return TRUE if DR_INFO can be handled with respect to alignment. */
1132 static bool
1133 verify_data_ref_alignment (dr_vec_info *dr_info)
1135 enum dr_alignment_support supportable_dr_alignment
1136 = vect_supportable_dr_alignment (dr_info, false);
1137 if (!supportable_dr_alignment)
1139 if (dump_enabled_p ())
1141 if (DR_IS_READ (dr_info->dr))
1142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1143 "not vectorized: unsupported unaligned load.");
1144 else
1145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1146 "not vectorized: unsupported unaligned "
1147 "store.");
1149 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1150 DR_REF (dr_info->dr));
1151 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1153 return false;
1156 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1157 dump_printf_loc (MSG_NOTE, vect_location,
1158 "Vectorizing an unaligned access.\n");
1160 return true;
1163 /* Function vect_verify_datarefs_alignment
1165 Return TRUE if all data references in the loop can be
1166 handled with respect to alignment. */
1168 bool
1169 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1171 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1172 struct data_reference *dr;
1173 unsigned int i;
1175 FOR_EACH_VEC_ELT (datarefs, i, dr)
1177 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1178 stmt_vec_info stmt_info = dr_info->stmt;
1180 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1181 continue;
1183 /* For interleaving, only the alignment of the first access matters. */
1184 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1185 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1186 continue;
1188 /* Strided accesses perform only component accesses, alignment is
1189 irrelevant for them. */
1190 if (STMT_VINFO_STRIDED_P (stmt_info)
1191 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1192 continue;
1194 if (! verify_data_ref_alignment (dr_info))
1195 return false;
1198 return true;
1201 /* Given a memory reference EXP return whether its alignment is less
1202 than its size. */
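/* E.g. (illustrative): a 4-byte 'int' member of a struct declared with
   '__attribute__((packed))' may only be guaranteed 1-byte alignment, so
   its alignment is smaller than its size and the access is treated as
   packed.  */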
1204 static bool
1205 not_size_aligned (tree exp)
1207 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1208 return true;
1210 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1211 > get_object_alignment (exp));
1214 /* Function vector_alignment_reachable_p
1216 Return true if vector alignment for DR_INFO is reachable by peeling
1217 a few loop iterations. Return false otherwise. */
1219 static bool
1220 vector_alignment_reachable_p (dr_vec_info *dr_info)
1222 stmt_vec_info stmt_info = dr_info->stmt;
1223 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1225 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1227 /* For interleaved access we peel only if number of iterations in
1228 the prolog loop ({VF - misalignment}), is a multiple of the
1229 number of the interleaved accesses. */
1230 int elem_size, mis_in_elements;
1232 /* FORNOW: handle only known alignment. */
1233 if (!known_alignment_for_access_p (dr_info))
1234 return false;
1236 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1237 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1238 elem_size = vector_element_size (vector_size, nelements);
1239 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1241 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1242 return false;
1245 /* If misalignment is known at the compile time then allow peeling
1246 only if natural alignment is reachable through peeling. */
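  /* E.g. (illustrative): a 4-byte element that is misaligned by 3 bytes can
     never become naturally aligned by peeling, because each peeled scalar
     iteration changes the address by a multiple of the element size.  */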
1247 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1249 HOST_WIDE_INT elmsize =
1250 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1251 if (dump_enabled_p ())
1253 dump_printf_loc (MSG_NOTE, vect_location,
1254 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1255 dump_printf (MSG_NOTE,
1256 ". misalignment = %d.\n", DR_MISALIGNMENT (dr_info));
1258 if (DR_MISALIGNMENT (dr_info) % elmsize)
1260 if (dump_enabled_p ())
1261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1262 "data size does not divide the misalignment.\n");
1263 return false;
1267 if (!known_alignment_for_access_p (dr_info))
1269 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1270 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1271 if (dump_enabled_p ())
1272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1273 "Unknown misalignment, %snaturally aligned\n",
1274 is_packed ? "not " : "");
1275 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1278 return true;
1282 /* Calculate the cost of the memory access represented by DR_INFO. */
1284 static void
1285 vect_get_data_access_cost (dr_vec_info *dr_info,
1286 unsigned int *inside_cost,
1287 unsigned int *outside_cost,
1288 stmt_vector_for_cost *body_cost_vec,
1289 stmt_vector_for_cost *prologue_cost_vec)
1291 stmt_vec_info stmt_info = dr_info->stmt;
1292 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1293 int ncopies;
1295 if (PURE_SLP_STMT (stmt_info))
1296 ncopies = 1;
1297 else
1298 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1300 if (DR_IS_READ (dr_info->dr))
1301 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1302 prologue_cost_vec, body_cost_vec, false);
1303 else
1304 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1306 if (dump_enabled_p ())
1307 dump_printf_loc (MSG_NOTE, vect_location,
1308 "vect_get_data_access_cost: inside_cost = %d, "
1309 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1313 typedef struct _vect_peel_info
1315 dr_vec_info *dr_info;
1316 int npeel;
1317 unsigned int count;
1318 } *vect_peel_info;
1320 typedef struct _vect_peel_extended_info
1322 struct _vect_peel_info peel_info;
1323 unsigned int inside_cost;
1324 unsigned int outside_cost;
1325 } *vect_peel_extended_info;
1328 /* Peeling hashtable helpers. */
1330 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1332 static inline hashval_t hash (const _vect_peel_info *);
1333 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1336 inline hashval_t
1337 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1339 return (hashval_t) peel_info->npeel;
1342 inline bool
1343 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1345 return (a->npeel == b->npeel);
1349 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1351 static void
1352 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1353 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1354 int npeel)
1356 struct _vect_peel_info elem, *slot;
1357 _vect_peel_info **new_slot;
1358 bool supportable_dr_alignment
1359 = vect_supportable_dr_alignment (dr_info, true);
1361 elem.npeel = npeel;
1362 slot = peeling_htab->find (&elem);
1363 if (slot)
1364 slot->count++;
1365 else
1367 slot = XNEW (struct _vect_peel_info);
1368 slot->npeel = npeel;
1369 slot->dr_info = dr_info;
1370 slot->count = 1;
1371 new_slot = peeling_htab->find_slot (slot, INSERT);
1372 *new_slot = slot;
1375 if (!supportable_dr_alignment
1376 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1377 slot->count += VECT_MAX_COST;
1381 /* Traverse peeling hash table to find peeling option that aligns maximum
1382 number of data accesses. */
1384 int
1385 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1386 _vect_peel_extended_info *max)
1388 vect_peel_info elem = *slot;
1390 if (elem->count > max->peel_info.count
1391 || (elem->count == max->peel_info.count
1392 && max->peel_info.npeel > elem->npeel))
1394 max->peel_info.npeel = elem->npeel;
1395 max->peel_info.count = elem->count;
1396 max->peel_info.dr_info = elem->dr_info;
1399 return 1;
1402 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1403 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1404 we assume DR0_INFO's misalignment will be zero after peeling. */
1406 static void
1407 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1408 dr_vec_info *dr0_info,
1409 unsigned int *inside_cost,
1410 unsigned int *outside_cost,
1411 stmt_vector_for_cost *body_cost_vec,
1412 stmt_vector_for_cost *prologue_cost_vec,
1413 unsigned int npeel,
1414 bool unknown_misalignment)
1416 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1417 unsigned i;
1418 data_reference *dr;
1420 FOR_EACH_VEC_ELT (datarefs, i, dr)
1422 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1423 stmt_vec_info stmt_info = dr_info->stmt;
1424 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1425 continue;
1427 /* For interleaving, only the alignment of the first access
1428 matters. */
1429 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1430 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1431 continue;
1433 /* Strided accesses perform only component accesses, alignment is
1434 irrelevant for them. */
1435 if (STMT_VINFO_STRIDED_P (stmt_info)
1436 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1437 continue;
1439 int save_misalignment;
1440 save_misalignment = DR_MISALIGNMENT (dr_info);
1441 if (npeel == 0)
1443 else if (unknown_misalignment && dr_info == dr0_info)
1444 SET_DR_MISALIGNMENT (dr_info, 0);
1445 else
1446 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1447 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1448 body_cost_vec, prologue_cost_vec);
1449 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1453 /* Traverse peeling hash table and calculate cost for each peeling option.
1454 Find the one with the lowest cost. */
1456 int
1457 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1458 _vect_peel_extended_info *min)
1460 vect_peel_info elem = *slot;
1461 int dummy;
1462 unsigned int inside_cost = 0, outside_cost = 0;
1463 stmt_vec_info stmt_info = elem->dr_info->stmt;
1464 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1465 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1466 epilogue_cost_vec;
1468 prologue_cost_vec.create (2);
1469 body_cost_vec.create (2);
1470 epilogue_cost_vec.create (2);
1472 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1473 &outside_cost, &body_cost_vec,
1474 &prologue_cost_vec, elem->npeel, false);
1476 body_cost_vec.release ();
1478 outside_cost += vect_get_known_peeling_cost
1479 (loop_vinfo, elem->npeel, &dummy,
1480 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1481 &prologue_cost_vec, &epilogue_cost_vec);
1483 /* Prologue and epilogue costs are added to the target model later.
1484 These costs depend only on the scalar iteration cost, the
1485 number of peeling iterations finally chosen, and the number of
1486 misaligned statements. So discard the information found here. */
1487 prologue_cost_vec.release ();
1488 epilogue_cost_vec.release ();
1490 if (inside_cost < min->inside_cost
1491 || (inside_cost == min->inside_cost
1492 && outside_cost < min->outside_cost))
1494 min->inside_cost = inside_cost;
1495 min->outside_cost = outside_cost;
1496 min->peel_info.dr_info = elem->dr_info;
1497 min->peel_info.npeel = elem->npeel;
1498 min->peel_info.count = elem->count;
1501 return 1;
1505 /* Choose best peeling option by traversing peeling hash table and either
1506 choosing an option with the lowest cost (if cost model is enabled) or the
1507 option that aligns as many accesses as possible. */
1509 static struct _vect_peel_extended_info
1510 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1511 loop_vec_info loop_vinfo)
1513 struct _vect_peel_extended_info res;
1515 res.peel_info.dr_info = NULL;
1517 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1519 res.inside_cost = INT_MAX;
1520 res.outside_cost = INT_MAX;
1521 peeling_htab->traverse <_vect_peel_extended_info *,
1522 vect_peeling_hash_get_lowest_cost> (&res);
1524 else
1526 res.peel_info.count = 0;
1527 peeling_htab->traverse <_vect_peel_extended_info *,
1528 vect_peeling_hash_get_most_frequent> (&res);
1529 res.inside_cost = 0;
1530 res.outside_cost = 0;
1533 return res;
1536 /* Return true if the new peeling NPEEL is supported. */
1538 static bool
1539 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1540 unsigned npeel)
1542 unsigned i;
1543 struct data_reference *dr = NULL;
1544 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1545 enum dr_alignment_support supportable_dr_alignment;
1547 /* Ensure that all data refs can be vectorized after the peel. */
1548 FOR_EACH_VEC_ELT (datarefs, i, dr)
1550 int save_misalignment;
1552 if (dr == dr0_info->dr)
1553 continue;
1555 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1556 stmt_vec_info stmt_info = dr_info->stmt;
1557 /* For interleaving, only the alignment of the first access
1558 matters. */
1559 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1560 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1561 continue;
1563 /* Strided accesses perform only component accesses, alignment is
1564 irrelevant for them. */
1565 if (STMT_VINFO_STRIDED_P (stmt_info)
1566 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1567 continue;
1569 save_misalignment = DR_MISALIGNMENT (dr_info);
1570 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1571 supportable_dr_alignment
1572 = vect_supportable_dr_alignment (dr_info, false);
1573 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1575 if (!supportable_dr_alignment)
1576 return false;
1579 return true;
1582 /* Function vect_enhance_data_refs_alignment
1584 This pass will use loop versioning and loop peeling in order to enhance
1585 the alignment of data references in the loop.
1587 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1588 original loop is to be vectorized. Any other loops that are created by
1589 the transformations performed in this pass - are not supposed to be
1590 vectorized. This restriction will be relaxed.
1592 This pass will require a cost model to guide it whether to apply peeling
1593 or versioning or a combination of the two. For example, the scheme that
1594 intel uses when given a loop with several memory accesses, is as follows:
1595 choose one memory access ('p') whose alignment you want to force by doing
1596 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1597 other accesses are not necessarily aligned, or (2) use loop versioning to
1598 generate one loop in which all accesses are aligned, and another loop in
1599 which only 'p' is necessarily aligned.
1601 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1602 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1603 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1605 Devising a cost model is the most critical aspect of this work. It will
1606 guide us on which access to peel for, whether to use loop versioning, how
1607 many versions to create, etc. The cost model will probably consist of
1608 generic considerations as well as target specific considerations (on
1609 powerpc for example, misaligned stores are more painful than misaligned
1610 loads).
1612 Here are the general steps involved in alignment enhancements:
1614 -- original loop, before alignment analysis:
1615 for (i=0; i<N; i++){
1616 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1617 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1620 -- After vect_compute_data_refs_alignment:
1621 for (i=0; i<N; i++){
1622 x = q[i]; # DR_MISALIGNMENT(q) = 3
1623 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1626 -- Possibility 1: we do loop versioning:
1627 if (p is aligned) {
1628 for (i=0; i<N; i++){ # loop 1A
1629 x = q[i]; # DR_MISALIGNMENT(q) = 3
1630 p[i] = y; # DR_MISALIGNMENT(p) = 0
1633 else {
1634 for (i=0; i<N; i++){ # loop 1B
1635 x = q[i]; # DR_MISALIGNMENT(q) = 3
1636 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1640 -- Possibility 2: we do loop peeling:
1641 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1642 x = q[i];
1643 p[i] = y;
1645 for (i = 3; i < N; i++){ # loop 2A
1646 x = q[i]; # DR_MISALIGNMENT(q) = 0
1647 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1650 -- Possibility 3: combination of loop peeling and versioning:
1651 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1652 x = q[i];
1653 p[i] = y;
1655 if (p is aligned) {
1656 for (i = 3; i<N; i++){ # loop 3A
1657 x = q[i]; # DR_MISALIGNMENT(q) = 0
1658 p[i] = y; # DR_MISALIGNMENT(p) = 0
1661 else {
1662 for (i = 3; i<N; i++){ # loop 3B
1663 x = q[i]; # DR_MISALIGNMENT(q) = 0
1664 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1668 These loops are later passed to loop_transform to be vectorized. The
1669 vectorizer will use the alignment information to guide the transformation
1670 (whether to generate regular loads/stores, or with special handling for
1671 misalignment). */
1673 bool
1674 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1676 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1677 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1678 enum dr_alignment_support supportable_dr_alignment;
1679 dr_vec_info *first_store = NULL;
1680 dr_vec_info *dr0_info = NULL;
1681 struct data_reference *dr;
1682 unsigned int i, j;
1683 bool do_peeling = false;
1684 bool do_versioning = false;
1685 bool stat;
1686 unsigned int npeel = 0;
1687 bool one_misalignment_known = false;
1688 bool one_misalignment_unknown = false;
1689 bool one_dr_unsupportable = false;
1690 dr_vec_info *unsupportable_dr_info = NULL;
1691 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1692 unsigned possible_npeel_number = 1;
1693 tree vectype;
1694 unsigned int mis, same_align_drs_max = 0;
1695 hash_table<peel_info_hasher> peeling_htab (1);
1697 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1699 /* Reset data so we can safely be called multiple times. */
1700 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1701 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1703 /* While cost model enhancements are expected in the future, the high level
1704 view of the code at this time is as follows:
1706 A) If there is a misaligned access then see if peeling to align
1707 this access can make all data references satisfy
1708 vect_supportable_dr_alignment. If so, update data structures
1709 as needed and return true.
1711 B) If peeling wasn't possible and there is a data reference with an
1712 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1713 then see if loop versioning checks can be used to make all data
1714 references satisfy vect_supportable_dr_alignment. If so, update
1715 data structures as needed and return true.
1717 C) If neither peeling nor versioning were successful then return false if
1718 any data reference does not satisfy vect_supportable_dr_alignment.
1720 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1722 Note, Possibility 3 above (which is peeling and versioning together) is not
1723 being done at this time. */
1725 /* (1) Peeling to force alignment. */
1727 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1728 Considerations:
1729 + How many accesses will become aligned due to the peeling
1730 - How many accesses will become unaligned due to the peeling,
1731 and the cost of misaligned accesses.
1732 - The cost of peeling (the extra runtime checks, the increase
1733 in code size). */
1735 FOR_EACH_VEC_ELT (datarefs, i, dr)
1737 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1738 stmt_vec_info stmt_info = dr_info->stmt;
1740 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1741 continue;
1743 /* For interleaving, only the alignment of the first access
1744 matters. */
1745 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1746 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1747 continue;
1749 /* For scatter-gather or invariant accesses there is nothing
1750 to enhance. */
1751 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1752 || integer_zerop (DR_STEP (dr)))
1753 continue;
1755 /* Strided accesses perform only component accesses, alignment is
1756 irrelevant for them. */
1757 if (STMT_VINFO_STRIDED_P (stmt_info)
1758 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1759 continue;
1761 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1762 do_peeling = vector_alignment_reachable_p (dr_info);
1763 if (do_peeling)
1765 if (known_alignment_for_access_p (dr_info))
1767 unsigned int npeel_tmp = 0;
1768 bool negative = tree_int_cst_compare (DR_STEP (dr),
1769 size_zero_node) < 0;
1771 vectype = STMT_VINFO_VECTYPE (stmt_info);
1772 unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);
1773 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1774 mis = (negative
1775 ? DR_MISALIGNMENT (dr_info)
1776 : -DR_MISALIGNMENT (dr_info));
1777 if (DR_MISALIGNMENT (dr_info) != 0)
1778 npeel_tmp = (mis & (target_align - 1)) / dr_size;
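/* Illustration with assumed numbers (not tied to any particular target):
   for a forward (positive-step) access with a 16-byte TARGET_ALIGN,
   4-byte scalar elements and DR_MISALIGNMENT of 4 bytes, MIS is -4,
   so (mis & 15) == 12 and NPEEL_TMP == 12 / 4 == 3: peeling three
   scalar iterations advances the address by 12 bytes and makes the
   access 16-byte aligned.  */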
1780 /* For multiple types, it is possible that the bigger type access
1781 will have more than one peeling option. E.g., a loop with two
1782 types: one of size (vector size / 4), and the other one of
1783 size (vector size / 8). The vectorization factor will be 8. If both
1784 accesses are misaligned by 3, the first one needs one scalar
1785 iteration to be aligned, and the second one needs 5. But the
1786 first one will be aligned also by peeling 5 scalar
1787 iterations, and in that case both accesses will be aligned.
1788 Hence, in addition to the immediate peeling amount, we also want
1789 to try adding a full vector's worth of elements, as long as
1790 we don't exceed the vectorization factor.
1791 We do this automatically for the cost model, since we calculate
1792 the cost for every peeling option.  */
1793 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1795 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1796 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1797 possible_npeel_number
1798 = vect_get_num_vectors (nscalars, vectype);
1800 /* NPEEL_TMP is 0 when there is no misalignment, but also
1801 allow peeling NELEMENTS. */
1802 if (DR_MISALIGNMENT (dr_info) == 0)
1803 possible_npeel_number++;
1806 /* Save info about DR in the hash table. Also include peeling
1807 amounts according to the explanation above. */
1808 for (j = 0; j < possible_npeel_number; j++)
1810 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1811 dr_info, npeel_tmp);
1812 npeel_tmp += target_align / dr_size;
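/* Sketch with assumed values: if NPEEL_TMP starts at 3, TARGET_ALIGN
   / DR_SIZE is 4 elements and POSSIBLE_NPEEL_NUMBER is 2, the loop
   above records the candidate peel amounts 3 and 7.  */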
1815 one_misalignment_known = true;
1817 else
1819 /* If we don't know any misalignment values, we prefer
1820 peeling for the data-ref that has the maximum number of data-refs
1821 with the same alignment, unless the target prefers to align
1822 stores over loads.  */
1823 unsigned same_align_drs
1824 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1825 if (!dr0_info
1826 || same_align_drs_max < same_align_drs)
1828 same_align_drs_max = same_align_drs;
1829 dr0_info = dr_info;
1831 /* For data-refs with the same number of related
1832 accesses prefer the one where the misalignment
1833 computation will be invariant in the outermost loop. */
1834 else if (same_align_drs_max == same_align_drs)
1836 struct loop *ivloop0, *ivloop;
1837 ivloop0 = outermost_invariant_loop_for_expr
1838 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1839 ivloop = outermost_invariant_loop_for_expr
1840 (loop, DR_BASE_ADDRESS (dr));
1841 if ((ivloop && !ivloop0)
1842 || (ivloop && ivloop0
1843 && flow_loop_nested_p (ivloop, ivloop0)))
1844 dr0_info = dr_info;
1847 one_misalignment_unknown = true;
1849 /* Check for data refs with unsupportable alignment that
1850 can be peeled. */
1851 if (!supportable_dr_alignment)
1853 one_dr_unsupportable = true;
1854 unsupportable_dr_info = dr_info;
1857 if (!first_store && DR_IS_WRITE (dr))
1858 first_store = dr_info;
1861 else
1863 if (!aligned_access_p (dr_info))
1865 if (dump_enabled_p ())
1866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1867 "vector alignment may not be reachable\n");
1868 break;
1873 /* Check if we can possibly peel the loop. */
1874 if (!vect_can_advance_ivs_p (loop_vinfo)
1875 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1876 || loop->inner)
1877 do_peeling = false;
1879 struct _vect_peel_extended_info peel_for_known_alignment;
1880 struct _vect_peel_extended_info peel_for_unknown_alignment;
1881 struct _vect_peel_extended_info best_peel;
1883 peel_for_unknown_alignment.inside_cost = INT_MAX;
1884 peel_for_unknown_alignment.outside_cost = INT_MAX;
1885 peel_for_unknown_alignment.peel_info.count = 0;
1887 if (do_peeling
1888 && one_misalignment_unknown)
1890 /* Check if the target requires us to prefer stores over loads, i.e., if
1891 misaligned stores are more expensive than misaligned loads (taking
1892 drs with the same alignment into account).  */
1893 unsigned int load_inside_cost = 0;
1894 unsigned int load_outside_cost = 0;
1895 unsigned int store_inside_cost = 0;
1896 unsigned int store_outside_cost = 0;
1897 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1899 stmt_vector_for_cost dummy;
1900 dummy.create (2);
1901 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1902 &load_inside_cost,
1903 &load_outside_cost,
1904 &dummy, &dummy, estimated_npeels, true);
1905 dummy.release ();
1907 if (first_store)
1909 dummy.create (2);
1910 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1911 &store_inside_cost,
1912 &store_outside_cost,
1913 &dummy, &dummy,
1914 estimated_npeels, true);
1915 dummy.release ();
1917 else
1919 store_inside_cost = INT_MAX;
1920 store_outside_cost = INT_MAX;
1923 if (load_inside_cost > store_inside_cost
1924 || (load_inside_cost == store_inside_cost
1925 && load_outside_cost > store_outside_cost))
1927 dr0_info = first_store;
1928 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1929 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1931 else
1933 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1934 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1937 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1938 prologue_cost_vec.create (2);
1939 epilogue_cost_vec.create (2);
1941 int dummy2;
1942 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1943 (loop_vinfo, estimated_npeels, &dummy2,
1944 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1945 &prologue_cost_vec, &epilogue_cost_vec);
1947 prologue_cost_vec.release ();
1948 epilogue_cost_vec.release ();
1950 peel_for_unknown_alignment.peel_info.count = 1
1951 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1954 peel_for_unknown_alignment.peel_info.npeel = 0;
1955 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1957 best_peel = peel_for_unknown_alignment;
1959 peel_for_known_alignment.inside_cost = INT_MAX;
1960 peel_for_known_alignment.outside_cost = INT_MAX;
1961 peel_for_known_alignment.peel_info.count = 0;
1962 peel_for_known_alignment.peel_info.dr_info = NULL;
1964 if (do_peeling && one_misalignment_known)
1966 /* Peeling is possible, but no data access requires alignment in order
1967 to be supported. So we try to choose the best possible peeling from
1968 the hash table. */
1969 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1970 (&peeling_htab, loop_vinfo);
1973 /* Compare costs of peeling for known and unknown alignment. */
1974 if (peel_for_known_alignment.peel_info.dr_info != NULL
1975 && peel_for_unknown_alignment.inside_cost
1976 >= peel_for_known_alignment.inside_cost)
1978 best_peel = peel_for_known_alignment;
1980 /* If the best peeling for known alignment has NPEEL == 0, perform no
1981 peeling at all except if there is an unsupportable dr that we can
1982 align. */
1983 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1984 do_peeling = false;
1987 /* If there is an unsupportable data ref, prefer this over all choices so far
1988 since we'd have to discard a chosen peeling except when it accidentally
1989 aligned the unsupportable data ref. */
1990 if (one_dr_unsupportable)
1991 dr0_info = unsupportable_dr_info;
1992 else if (do_peeling)
1994 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1995 TODO: Use nopeel_outside_cost or get rid of it? */
1996 unsigned nopeel_inside_cost = 0;
1997 unsigned nopeel_outside_cost = 0;
1999 stmt_vector_for_cost dummy;
2000 dummy.create (2);
2001 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2002 &nopeel_outside_cost, &dummy, &dummy,
2003 0, false);
2004 dummy.release ();
2006 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2007 costs will be recorded. */
2008 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2009 prologue_cost_vec.create (2);
2010 epilogue_cost_vec.create (2);
2012 int dummy2;
2013 nopeel_outside_cost += vect_get_known_peeling_cost
2014 (loop_vinfo, 0, &dummy2,
2015 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2016 &prologue_cost_vec, &epilogue_cost_vec);
2018 prologue_cost_vec.release ();
2019 epilogue_cost_vec.release ();
2021 npeel = best_peel.peel_info.npeel;
2022 dr0_info = best_peel.peel_info.dr_info;
2024 /* If not peeling at all is no more expensive than the best peeling we
2025 have found so far, don't perform any peeling.  */
2026 if (nopeel_inside_cost <= best_peel.inside_cost)
2027 do_peeling = false;
2030 if (do_peeling)
2032 stmt_vec_info stmt_info = dr0_info->stmt;
2033 vectype = STMT_VINFO_VECTYPE (stmt_info);
2035 if (known_alignment_for_access_p (dr0_info))
2037 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2038 size_zero_node) < 0;
2039 if (!npeel)
2041 /* Since it's known at compile time, compute the number of
2042 iterations in the peeled loop (the peeling factor) for use in
2043 updating DR_MISALIGNMENT values. The peeling factor is the
2044 vectorization factor minus the misalignment as an element
2045 count. */
2046 mis = (negative
2047 ? DR_MISALIGNMENT (dr0_info)
2048 : -DR_MISALIGNMENT (dr0_info));
2049 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
2050 npeel = ((mis & (target_align - 1))
2051 / vect_get_scalar_dr_size (dr0_info));
2054 /* For interleaved data access every iteration accesses all the
2055 members of the group, therefore we divide the number of iterations
2056 by the group size. */
2057 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2058 npeel /= DR_GROUP_SIZE (stmt_info);
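/* For instance (values assumed): if NPEEL was computed as 6 scalar
   elements and the group has 2 members, each loop iteration accesses
   both members, so only 6 / 2 == 3 peeled iterations are needed.  */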
2060 if (dump_enabled_p ())
2061 dump_printf_loc (MSG_NOTE, vect_location,
2062 "Try peeling by %d\n", npeel);
2065 /* Ensure that all datarefs can be vectorized after the peel. */
2066 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2067 do_peeling = false;
2069 /* Check if all datarefs are supportable and log. */
2070 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2072 stat = vect_verify_datarefs_alignment (loop_vinfo);
2073 if (!stat)
2074 do_peeling = false;
2075 else
2076 return stat;
2079 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2080 if (do_peeling)
2082 unsigned max_allowed_peel
2083 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2084 if (max_allowed_peel != (unsigned)-1)
2086 unsigned max_peel = npeel;
2087 if (max_peel == 0)
2089 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
2090 max_peel = (target_align
2091 / vect_get_scalar_dr_size (dr0_info) - 1);
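/* As an assumed example: with a 16-byte TARGET_ALIGN and 4-byte
   scalar elements, an unknown NPEEL means a worst case of
   16 / 4 - 1 == 3 peeled iterations, which is what gets compared
   against the --param vect-max-peeling-for-alignment limit below.  */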
2093 if (max_peel > max_allowed_peel)
2095 do_peeling = false;
2096 if (dump_enabled_p ())
2097 dump_printf_loc (MSG_NOTE, vect_location,
2098 "Disable peeling, max peels reached: %d\n", max_peel);
2103 /* Cost model #2 - if peeling may result in a remaining loop not
2104 iterating enough to be vectorized then do not peel. Since this
2105 is a cost heuristic rather than a correctness decision, use the
2106 most likely runtime value for variable vectorization factors. */
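/* As an assumed example: with a cost-model VF of 4 and NPEEL still
   unknown (worst case 3 peeled iterations), a loop known to run
   fewer than 4 + 3 == 7 iterations is not worth peeling.  */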
2107 if (do_peeling
2108 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2110 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2111 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2112 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2113 < assumed_vf + max_peel)
2114 do_peeling = false;
2117 if (do_peeling)
2119 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2120 If the misalignment of DR_i is identical to that of dr0 then set
2121 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2122 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2123 by the peeling factor times the element size of DR_i (MOD the
2124 vectorization factor times the size). Otherwise, the
2125 misalignment of DR_i must be set to unknown. */
2126 FOR_EACH_VEC_ELT (datarefs, i, dr)
2127 if (dr != dr0_info->dr)
2129 /* Strided accesses perform only component accesses, alignment
2130 is irrelevant for them. */
2131 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2132 stmt_info = dr_info->stmt;
2133 if (STMT_VINFO_STRIDED_P (stmt_info)
2134 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2135 continue;
2137 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2140 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2141 if (npeel)
2142 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2143 else
2144 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2145 = DR_MISALIGNMENT (dr0_info);
2146 SET_DR_MISALIGNMENT (dr0_info, 0);
2147 if (dump_enabled_p ())
2149 dump_printf_loc (MSG_NOTE, vect_location,
2150 "Alignment of access forced using peeling.\n");
2151 dump_printf_loc (MSG_NOTE, vect_location,
2152 "Peeling for alignment will be applied.\n");
2155 /* The inside-loop cost will be accounted for in vectorizable_load
2156 and vectorizable_store correctly with adjusted alignments.
2157 Drop the body_cst_vec on the floor here. */
2158 stat = vect_verify_datarefs_alignment (loop_vinfo);
2159 gcc_assert (stat);
2160 return stat;
2164 /* (2) Versioning to force alignment. */
2166 /* Try versioning if:
2167 1) optimize loop for speed
2168 2) there is at least one unsupported misaligned data ref with an unknown
2169 misalignment, and
2170 3) all misaligned data refs with a known misalignment are supported, and
2171 4) the number of runtime alignment checks is within reason. */
2173 do_versioning =
2174 optimize_loop_nest_for_speed_p (loop)
2175 && (!loop->inner); /* FORNOW */
2177 if (do_versioning)
2179 FOR_EACH_VEC_ELT (datarefs, i, dr)
2181 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2182 stmt_vec_info stmt_info = dr_info->stmt;
2184 /* For interleaving, only the alignment of the first access
2185 matters. */
2186 if (aligned_access_p (dr_info)
2187 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2188 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2189 continue;
2191 if (STMT_VINFO_STRIDED_P (stmt_info))
2193 /* Strided loads perform only component accesses, alignment is
2194 irrelevant for them. */
2195 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2196 continue;
2197 do_versioning = false;
2198 break;
2201 supportable_dr_alignment
2202 = vect_supportable_dr_alignment (dr_info, false);
2204 if (!supportable_dr_alignment)
2206 int mask;
2207 tree vectype;
2209 if (known_alignment_for_access_p (dr_info)
2210 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2211 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2213 do_versioning = false;
2214 break;
2217 vectype = STMT_VINFO_VECTYPE (stmt_info);
2218 gcc_assert (vectype);
2220 /* At present we don't support versioning for alignment
2221 with variable VF, since there's no guarantee that the
2222 VF is a power of two. We could relax this if we added
2223 a way of enforcing a power-of-two size. */
2224 unsigned HOST_WIDE_INT size;
2225 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2227 do_versioning = false;
2228 break;
2231 /* The rightmost bits of an aligned address must be zeros.
2232 Construct the mask needed for this test. For example,
2233 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2234 mask must be 15 = 0xf. */
2235 mask = size - 1;
2237 /* FORNOW: use the same mask to test all potentially unaligned
2238 references in the loop. The vectorizer currently supports
2239 a single vector size, see the reference to
2240 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2241 vectorization factor is computed. */
2242 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2243 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2244 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2245 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2249 /* Versioning requires at least one misaligned data reference. */
2250 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2251 do_versioning = false;
2252 else if (!do_versioning)
2253 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2256 if (do_versioning)
2258 vec<stmt_vec_info> may_misalign_stmts
2259 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2260 stmt_vec_info stmt_info;
2262 /* It can now be assumed that the data references in the statements
2263 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2264 of the loop being vectorized. */
2265 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2267 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2268 SET_DR_MISALIGNMENT (dr_info, 0);
2269 if (dump_enabled_p ())
2270 dump_printf_loc (MSG_NOTE, vect_location,
2271 "Alignment of access forced using versioning.\n");
2274 if (dump_enabled_p ())
2275 dump_printf_loc (MSG_NOTE, vect_location,
2276 "Versioning for alignment will be applied.\n");
2278 /* Peeling and versioning can't be done together at this time. */
2279 gcc_assert (! (do_peeling && do_versioning));
2281 stat = vect_verify_datarefs_alignment (loop_vinfo);
2282 gcc_assert (stat);
2283 return stat;
2286 /* This point is reached if neither peeling nor versioning is being done. */
2287 gcc_assert (! (do_peeling || do_versioning));
2289 stat = vect_verify_datarefs_alignment (loop_vinfo);
2290 return stat;
2294 /* Function vect_find_same_alignment_drs.
2296 Update group and alignment relations in VINFO according to the chosen
2297 vectorization factor. */
2299 static void
2300 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2302 struct data_reference *dra = DDR_A (ddr);
2303 struct data_reference *drb = DDR_B (ddr);
2304 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2305 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2306 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2307 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2309 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2310 return;
2312 if (dra == drb)
2313 return;
2315 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2316 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2317 return;
2319 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2320 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2321 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2322 return;
2324 /* Two references with distance zero have the same alignment. */
2325 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2326 - wi::to_poly_offset (DR_INIT (drb)));
2327 if (maybe_ne (diff, 0))
2329 /* Get the wider of the two alignments. */
2330 unsigned int align_a = (vect_calculate_target_alignment (dr_info_a)
2331 / BITS_PER_UNIT);
2332 unsigned int align_b = (vect_calculate_target_alignment (dr_info_b)
2333 / BITS_PER_UNIT);
2334 unsigned int max_align = MAX (align_a, align_b);
2336 /* Require the gap to be a multiple of the larger vector alignment. */
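/* For example (values assumed): two accesses whose DR_INITs differ
   by 32 bytes, each with a 16-byte target alignment, satisfy this
   test; a 20-byte gap would not, since 20 is not a multiple of 16.  */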
2337 if (!multiple_p (diff, max_align))
2338 return;
2341 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2342 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2343 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_NOTE, vect_location,
2346 "accesses have the same alignment: ");
2347 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2348 dump_printf (MSG_NOTE, " and ");
2349 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2350 dump_printf (MSG_NOTE, "\n");
2355 /* Function vect_analyze_data_refs_alignment
2357 Analyze the alignment of the data-references in the loop.
2358 Return FALSE if a data reference is found that cannot be vectorized. */
2360 bool
2361 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2363 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2365 /* Mark groups of data references with same alignment using
2366 data dependence information. */
2367 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2368 struct data_dependence_relation *ddr;
2369 unsigned int i;
2371 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2372 vect_find_same_alignment_drs (vinfo, ddr);
2374 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2375 struct data_reference *dr;
2377 vect_record_base_alignments (vinfo);
2378 FOR_EACH_VEC_ELT (datarefs, i, dr)
2380 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2381 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2382 vect_compute_data_ref_alignment (dr_info);
2385 return true;
2389 /* Analyze alignment of DRs of stmts in NODE. */
2391 static bool
2392 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2394 /* We vectorize from the first scalar stmt in the node unless
2395 the node is permuted, in which case we start from the first
2396 element in the group. */
2397 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2398 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2399 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2400 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2402 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2403 vect_compute_data_ref_alignment (dr_info);
2404 /* For creating the data-ref pointer we need alignment of the
2405 first element anyway. */
2406 if (dr_info != first_dr_info)
2407 vect_compute_data_ref_alignment (first_dr_info);
2408 if (! verify_data_ref_alignment (dr_info))
2410 if (dump_enabled_p ())
2411 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2412 "not vectorized: bad data alignment in basic "
2413 "block.\n");
2414 return false;
2417 return true;
2420 /* Function vect_slp_analyze_and_verify_instance_alignment
2422 Analyze the alignment of the data-references in the SLP instance.
2423 Return FALSE if a data reference is found that cannot be vectorized. */
2425 bool
2426 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2428 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2430 slp_tree node;
2431 unsigned i;
2432 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2433 if (! vect_slp_analyze_and_verify_node_alignment (node))
2434 return false;
2436 node = SLP_INSTANCE_TREE (instance);
2437 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2438 && ! vect_slp_analyze_and_verify_node_alignment
2439 (SLP_INSTANCE_TREE (instance)))
2440 return false;
2442 return true;
2446 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2447 accesses of legal size, step, etc. Detect gaps, single element
2448 interleaving, and other special cases. Set grouped access info.
2449 Collect groups of strided stores for further use in SLP analysis.
2450 Worker for vect_analyze_group_access. */
2452 static bool
2453 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2455 data_reference *dr = dr_info->dr;
2456 tree step = DR_STEP (dr);
2457 tree scalar_type = TREE_TYPE (DR_REF (dr));
2458 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2459 stmt_vec_info stmt_info = dr_info->stmt;
2460 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2461 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2462 HOST_WIDE_INT dr_step = -1;
2463 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2464 bool slp_impossible = false;
2466 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2467 size of the interleaving group (including gaps). */
2468 if (tree_fits_shwi_p (step))
2470 dr_step = tree_to_shwi (step);
2471 /* Check that STEP is a multiple of type size. Otherwise there is
2472 a non-element-sized gap at the end of the group which we
2473 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2474 ??? As we can handle non-constant step fine here we should
2475 simply remove uses of DR_GROUP_GAP between the last and first
2476 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2477 simply not include that gap. */
2478 if ((dr_step % type_size) != 0)
2480 if (dump_enabled_p ())
2482 dump_printf_loc (MSG_NOTE, vect_location,
2483 "Step ");
2484 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2485 dump_printf (MSG_NOTE,
2486 " is not a multiple of the element size for ");
2487 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2488 dump_printf (MSG_NOTE, "\n");
2490 return false;
2492 groupsize = absu_hwi (dr_step) / type_size;
2494 else
2495 groupsize = 0;
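/* E.g. (assumed values): a constant DR_STEP of 32 bytes over 4-byte
   elements gives GROUPSIZE == 8, i.e. each scalar iteration strides
   over eight element slots, gaps included.  */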
2497 /* A non-consecutive access is possible only if it is part of an interleaving group.  */
2498 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2500 /* Check if this DR is part of an interleaving group, and is a single
2501 element of the group that is accessed in the loop. */
2503 /* Gaps are supported only for loads. STEP must be a multiple of the type
2504 size. */
2505 if (DR_IS_READ (dr)
2506 && (dr_step % type_size) == 0
2507 && groupsize > 0)
2509 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2510 DR_GROUP_SIZE (stmt_info) = groupsize;
2511 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2512 if (dump_enabled_p ())
2514 dump_printf_loc (MSG_NOTE, vect_location,
2515 "Detected single element interleaving ");
2516 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2517 dump_printf (MSG_NOTE, " step ");
2518 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2519 dump_printf (MSG_NOTE, "\n");
2522 return true;
2525 if (dump_enabled_p ())
2527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2528 "not consecutive access ");
2529 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2530 stmt_info->stmt, 0);
2533 if (bb_vinfo)
2535 /* Mark the statement as unvectorizable. */
2536 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2537 return true;
2540 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2541 STMT_VINFO_STRIDED_P (stmt_info) = true;
2542 return true;
2545 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2547 /* First stmt in the interleaving chain. Check the chain. */
2548 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2549 struct data_reference *data_ref = dr;
2550 unsigned int count = 1;
2551 tree prev_init = DR_INIT (data_ref);
2552 stmt_vec_info prev = stmt_info;
2553 HOST_WIDE_INT diff, gaps = 0;
2555 /* By construction, all group members have INTEGER_CST DR_INITs. */
2556 while (next)
2558 /* Skip same data-refs. In case two or more stmts share a
2559 data-ref (supported only for loads), we vectorize only the first
2560 stmt, and the rest get their vectorized loads from the first
2561 one. */
2562 if (!tree_int_cst_compare (DR_INIT (data_ref),
2563 DR_INIT (STMT_VINFO_DATA_REF (next))))
2565 if (DR_IS_WRITE (data_ref))
2567 if (dump_enabled_p ())
2568 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2569 "Two store stmts share the same dr.\n");
2570 return false;
2573 if (dump_enabled_p ())
2574 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2575 "Two or more load stmts share the same dr.\n");
2577 /* For load use the same data-ref load. */
2578 DR_GROUP_SAME_DR_STMT (next) = prev;
2580 prev = next;
2581 next = DR_GROUP_NEXT_ELEMENT (next);
2582 continue;
2585 prev = next;
2586 data_ref = STMT_VINFO_DATA_REF (next);
2588 /* All group members have the same STEP by construction. */
2589 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2591 /* Check that the distance between two accesses is equal to the type
2592 size. Otherwise, we have gaps. */
2593 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2594 - TREE_INT_CST_LOW (prev_init)) / type_size;
2595 if (diff != 1)
2597 /* FORNOW: SLP of accesses with gaps is not supported. */
2598 slp_impossible = true;
2599 if (DR_IS_WRITE (data_ref))
2601 if (dump_enabled_p ())
2602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2603 "interleaved store with gaps\n");
2604 return false;
2607 gaps += diff - 1;
2610 last_accessed_element += diff;
2612 /* Store the gap from the previous member of the group. If there is no
2613 gap in the access, DR_GROUP_GAP is always 1. */
2614 DR_GROUP_GAP (next) = diff;
2616 prev_init = DR_INIT (data_ref);
2617 next = DR_GROUP_NEXT_ELEMENT (next);
2618 /* Count the number of data-refs in the chain. */
2619 count++;
2622 if (groupsize == 0)
2623 groupsize = count + gaps;
2625 /* This could be UINT_MAX but as we are generating code in a very
2626 inefficient way we have to cap earlier. See PR78699 for example. */
2627 if (groupsize > 4096)
2629 if (dump_enabled_p ())
2630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2631 "group is too large\n");
2632 return false;
2635 /* Check that the size of the interleaving is equal to count for stores,
2636 i.e., that there are no gaps. */
2637 if (groupsize != count
2638 && !DR_IS_READ (dr))
2640 if (dump_enabled_p ())
2641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2642 "interleaved store with gaps\n");
2643 return false;
2646 /* If there is a gap after the last load in the group it is the
2647 difference between the groupsize and the last accessed
2648 element.
2649 When there is no gap, this difference should be 0. */
2650 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
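/* Sketch with assumed numbers: a load group with GROUPSIZE 4 whose
   last accessed element is 3 gets a trailing DR_GROUP_GAP of
   4 - 3 == 1.  */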
2652 DR_GROUP_SIZE (stmt_info) = groupsize;
2653 if (dump_enabled_p ())
2655 dump_printf_loc (MSG_NOTE, vect_location,
2656 "Detected interleaving ");
2657 if (DR_IS_READ (dr))
2658 dump_printf (MSG_NOTE, "load ");
2659 else
2660 dump_printf (MSG_NOTE, "store ");
2661 dump_printf (MSG_NOTE, "of size %u starting with ",
2662 (unsigned)groupsize);
2663 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
2664 if (DR_GROUP_GAP (stmt_info) != 0)
2665 dump_printf_loc (MSG_NOTE, vect_location,
2666 "There is a gap of %u elements after the group\n",
2667 DR_GROUP_GAP (stmt_info));
2670 /* SLP: create an SLP data structure for every interleaving group of
2671 stores for further analysis in vect_analyze_slp.  */
2672 if (DR_IS_WRITE (dr) && !slp_impossible)
2674 if (loop_vinfo)
2675 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2676 if (bb_vinfo)
2677 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2681 return true;
2684 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2685 accesses of legal size, step, etc. Detect gaps, single element
2686 interleaving, and other special cases. Set grouped access info.
2687 Collect groups of strided stores for further use in SLP analysis. */
2689 static bool
2690 vect_analyze_group_access (dr_vec_info *dr_info)
2692 if (!vect_analyze_group_access_1 (dr_info))
2694 /* Dissolve the group if present. */
2695 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2696 while (stmt_info)
2698 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2699 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2700 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2701 stmt_info = next;
2703 return false;
2705 return true;
2708 /* Analyze the access pattern of the data-reference DR_INFO.
2709 In case of non-consecutive accesses call vect_analyze_group_access() to
2710 analyze groups of accesses. */
2712 static bool
2713 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2715 data_reference *dr = dr_info->dr;
2716 tree step = DR_STEP (dr);
2717 tree scalar_type = TREE_TYPE (DR_REF (dr));
2718 stmt_vec_info stmt_info = dr_info->stmt;
2719 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2720 struct loop *loop = NULL;
2722 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2723 return true;
2725 if (loop_vinfo)
2726 loop = LOOP_VINFO_LOOP (loop_vinfo);
2728 if (loop_vinfo && !step)
2730 if (dump_enabled_p ())
2731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2732 "bad data-ref access in loop\n");
2733 return false;
2736 /* Allow loads with zero step in inner-loop vectorization. */
2737 if (loop_vinfo && integer_zerop (step))
2739 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2740 if (!nested_in_vect_loop_p (loop, stmt_info))
2741 return DR_IS_READ (dr);
2742 /* Allow references with zero step for outer loops marked
2743 with pragma omp simd only - it guarantees absence of
2744 loop-carried dependencies between inner loop iterations. */
2745 if (loop->safelen < 2)
2747 if (dump_enabled_p ())
2748 dump_printf_loc (MSG_NOTE, vect_location,
2749 "zero step in inner loop of nest\n");
2750 return false;
2754 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2756 /* Interleaved accesses are not yet supported within outer-loop
2757 vectorization for references in the inner-loop. */
2758 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2760 /* For the rest of the analysis we use the outer-loop step. */
2761 step = STMT_VINFO_DR_STEP (stmt_info);
2762 if (integer_zerop (step))
2764 if (dump_enabled_p ())
2765 dump_printf_loc (MSG_NOTE, vect_location,
2766 "zero step in outer loop.\n");
2767 return DR_IS_READ (dr);
2771 /* Consecutive? */
2772 if (TREE_CODE (step) == INTEGER_CST)
2774 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2775 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2776 || (dr_step < 0
2777 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2779 /* Mark that it is not interleaving. */
2780 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2781 return true;
2785 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2787 if (dump_enabled_p ())
2788 dump_printf_loc (MSG_NOTE, vect_location,
2789 "grouped access in outer loop.\n");
2790 return false;
2794 /* Assume this is a DR handled by non-constant strided load case. */
2795 if (TREE_CODE (step) != INTEGER_CST)
2796 return (STMT_VINFO_STRIDED_P (stmt_info)
2797 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2798 || vect_analyze_group_access (dr_info)));
2800 /* Non-consecutive access - check if it is part of an interleaving group.  */
2801 return vect_analyze_group_access (dr_info);
2804 /* Compare two data-references DRA and DRB, so that they can be sorted
2805 into chunks suitable for grouping.  */
2807 static int
2808 dr_group_sort_cmp (const void *dra_, const void *drb_)
2810 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2811 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2812 int cmp;
2814 /* Stabilize sort. */
2815 if (dra == drb)
2816 return 0;
2818 /* DRs in different loops never belong to the same group. */
2819 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2820 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2821 if (loopa != loopb)
2822 return loopa->num < loopb->num ? -1 : 1;
2824 /* Ordering of DRs according to base. */
2825 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2826 DR_BASE_ADDRESS (drb));
2827 if (cmp != 0)
2828 return cmp;
2830 /* And according to DR_OFFSET. */
2831 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2832 if (cmp != 0)
2833 return cmp;
2835 /* Put reads before writes. */
2836 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2837 return DR_IS_READ (dra) ? -1 : 1;
2839 /* Then sort by access size. */
2840 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2841 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2842 if (cmp != 0)
2843 return cmp;
2845 /* And by step. */
2846 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2847 if (cmp != 0)
2848 return cmp;
2850 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2851 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2852 if (cmp == 0)
2853 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2854 return cmp;
2857 /* If OP is the result of a conversion, return the unconverted value,
2858 otherwise return null. */
2860 static tree
2861 strip_conversion (tree op)
2863 if (TREE_CODE (op) != SSA_NAME)
2864 return NULL_TREE;
2865 gimple *stmt = SSA_NAME_DEF_STMT (op);
2866 if (!is_gimple_assign (stmt)
2867 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2868 return NULL_TREE;
2869 return gimple_assign_rhs1 (stmt);
2872 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2873 and STMT2_INFO being in a single group. */
2875 static bool
2876 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
2878 if (gimple_assign_single_p (stmt1_info->stmt))
2879 return gimple_assign_single_p (stmt2_info->stmt);
2881 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2882 if (call1 && gimple_call_internal_p (call1))
2884 /* Check for two masked loads or two masked stores. */
2885 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2886 if (!call2 || !gimple_call_internal_p (call2))
2887 return false;
2888 internal_fn ifn = gimple_call_internal_fn (call1);
2889 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2890 return false;
2891 if (ifn != gimple_call_internal_fn (call2))
2892 return false;
2894 /* Check that the masks are the same. Cope with casts of masks,
2895 like those created by build_mask_conversion. */
2896 tree mask1 = gimple_call_arg (call1, 2);
2897 tree mask2 = gimple_call_arg (call2, 2);
2898 if (!operand_equal_p (mask1, mask2, 0))
2900 mask1 = strip_conversion (mask1);
2901 if (!mask1)
2902 return false;
2903 mask2 = strip_conversion (mask2);
2904 if (!mask2)
2905 return false;
2906 if (!operand_equal_p (mask1, mask2, 0))
2907 return false;
2909 return true;
2912 return false;
2915 /* Function vect_analyze_data_ref_accesses.
2917 Analyze the access pattern of all the data references in the loop.
2919 FORNOW: the only access pattern that is considered vectorizable is a
2920 simple step 1 (consecutive) access.
2922 FORNOW: handle only arrays and pointer accesses. */
2924 bool
2925 vect_analyze_data_ref_accesses (vec_info *vinfo)
2927 unsigned int i;
2928 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2929 struct data_reference *dr;
2931 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2933 if (datarefs.is_empty ())
2934 return true;
2936 /* Sort the array of datarefs to make building the interleaving chains
2937 linear. Don't modify the original vector's order; it is needed for
2938 determining what dependencies are reversed. */
2939 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2940 datarefs_copy.qsort (dr_group_sort_cmp);
2942 /* Build the interleaving chains. */
2943 for (i = 0; i < datarefs_copy.length () - 1;)
2945 data_reference_p dra = datarefs_copy[i];
2946 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2947 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2948 stmt_vec_info lastinfo = NULL;
2949 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2950 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2952 ++i;
2953 continue;
2955 for (i = i + 1; i < datarefs_copy.length (); ++i)
2957 data_reference_p drb = datarefs_copy[i];
2958 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2959 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2960 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2961 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2962 break;
2964 /* ??? Imperfect sorting (non-compatible types, non-modulo
2965 accesses, same accesses) can lead to a group being artificially
2966 split here as we don't just skip over those. If it really
2967 matters we can push those to a worklist and re-iterate
2968 over them. Then we can just skip ahead to the next DR here. */
2970 /* DRs in different loops should not be put into the same
2971 interleaving group. */
2972 if (gimple_bb (DR_STMT (dra))->loop_father
2973 != gimple_bb (DR_STMT (drb))->loop_father)
2974 break;
2976 /* Check that the data-refs have the same first location (except init)
2977 and that they are both either stores or loads (not a load and a store,
2978 and not incompatible masked loads or stores).  */
2979 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2980 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2981 DR_BASE_ADDRESS (drb)) != 0
2982 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2983 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
2984 break;
2986 /* Check that the data-refs have the same constant size. */
2987 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2988 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2989 if (!tree_fits_uhwi_p (sza)
2990 || !tree_fits_uhwi_p (szb)
2991 || !tree_int_cst_equal (sza, szb))
2992 break;
2994 /* Check that the data-refs have the same step. */
2995 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2996 break;
2998 /* Check the types are compatible.
2999 ??? We don't distinguish this during sorting. */
3000 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3001 TREE_TYPE (DR_REF (drb))))
3002 break;
3004 /* Check that the DR_INITs are compile-time constants. */
3005 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3006 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3007 break;
3009 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3010 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3011 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3012 HOST_WIDE_INT init_prev
3013 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3014 gcc_assert (init_a <= init_b
3015 && init_a <= init_prev
3016 && init_prev <= init_b);
3018 /* Do not place the same access in the interleaving chain twice. */
3019 if (init_b == init_prev)
3021 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3022 < gimple_uid (DR_STMT (drb)));
3023 /* ??? For now we simply "drop" the later reference which is
3024 otherwise the same rather than finishing off this group.
3025 In the end we'd want to re-process duplicates forming
3026 multiple groups from the refs, likely by just collecting
3027 all candidates (including duplicates and split points
3028 below) in a vector and then processing them together.  */
3029 continue;
3032 /* If init_b == init_a + the size of the type * k, we have an
3033 interleaving, and DRA is accessed before DRB. */
3034 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3035 if (type_size_a == 0
3036 || (init_b - init_a) % type_size_a != 0)
3037 break;
3039 /* If we have a store, the accesses are adjacent. This splits
3040 groups into chunks we support (we don't support vectorization
3041 of stores with gaps). */
3042 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3043 break;
3045 /* Requiring the step (when it is constant and nonzero) to be greater
3046 than the difference between the data-refs' inits splits groups into
3047 suitable sizes.  */
3048 if (tree_fits_shwi_p (DR_STEP (dra)))
3050 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3051 if (step != 0 && step <= (init_b - init_a))
3052 break;
3055 if (dump_enabled_p ())
3057 dump_printf_loc (MSG_NOTE, vect_location,
3058 "Detected interleaving ");
3059 if (DR_IS_READ (dra))
3060 dump_printf (MSG_NOTE, "load ");
3061 else
3062 dump_printf (MSG_NOTE, "store ");
3063 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3064 dump_printf (MSG_NOTE, " and ");
3065 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3066 dump_printf (MSG_NOTE, "\n");
3069 /* Link the found element into the group list. */
3070 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3072 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3073 lastinfo = stmtinfo_a;
3075 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3076 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3077 lastinfo = stmtinfo_b;
3081 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3083 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3084 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3085 && !vect_analyze_data_ref_access (dr_info))
3087 if (dump_enabled_p ())
3088 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3089 "not vectorized: complicated access pattern.\n");
3091 if (is_a <bb_vec_info> (vinfo))
3093 /* Mark the statement as not vectorizable. */
3094 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3095 continue;
3097 else
3099 datarefs_copy.release ();
3100 return false;
3105 datarefs_copy.release ();
3106 return true;
3109 /* Function vect_vfa_segment_size.
3111 Input:
3112 DR_INFO: The data reference.
3113 LENGTH_FACTOR: segment length to consider.
3115 Return a value suitable for the dr_with_seg_len::seg_len field.
3116 This is the "distance travelled" by the pointer from the first
3117 iteration in the segment to the last. Note that it does not include
3118 the size of the access; in effect it only describes the first byte. */
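/* A minimal worked example (numbers assumed): for a 4-byte DR_STEP
   and a LENGTH_FACTOR of 8, the segment length is
   4 * (8 - 1) == 28 bytes -- the distance from the first to the last
   scalar access, excluding the access size itself.  */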
3120 static tree
3121 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3123 length_factor = size_binop (MINUS_EXPR,
3124 fold_convert (sizetype, length_factor),
3125 size_one_node);
3126 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3127 length_factor);
3130 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3131 gives the worst-case number of bytes covered by the segment. */
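/* Illustration (assumed values): a 4-byte scalar reference leading a
   group with DR_GROUP_SIZE 4 and DR_GROUP_GAP 1 yields an access size
   of 4 * (4 - 1) == 12 bytes, before the realignment adjustment made
   below for dr_explicit_realign_optimized.  */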
3133 static unsigned HOST_WIDE_INT
3134 vect_vfa_access_size (dr_vec_info *dr_info)
3136 stmt_vec_info stmt_vinfo = dr_info->stmt;
3137 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3138 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3139 unsigned HOST_WIDE_INT access_size = ref_size;
3140 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3142 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3143 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3145 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3146 && (vect_supportable_dr_alignment (dr_info, false)
3147 == dr_explicit_realign_optimized))
3149 /* We might access a full vector's worth. */
3150 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3151 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3153 return access_size;
3156 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3157 describes. */
3159 static unsigned int
3160 vect_vfa_align (dr_vec_info *dr_info)
3162 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3165 /* Function vect_compile_time_alias.
3167 Given data references A and B with equal base and offset, see whether
3168 the alias relation can be decided at compilation time. Return 1 if
3169 it can and the references alias, 0 if it can and the references do
3170 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3171 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3172 of dr_with_seg_len::{seg_len,access_size} for A and B. */
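/* Worked example with assumed values: forward steps, DR_INITs of 0 and
   16, segment lengths of 12 bytes and access sizes of 4 bytes give the
   byte ranges [0, 16) and [16, 32), so the result is 0 (no alias);
   DR_INITs of 0 and 8 would instead give overlapping ranges and a
   result of 1.  */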
3174 static int
3175 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3176 tree segment_length_a, tree segment_length_b,
3177 unsigned HOST_WIDE_INT access_size_a,
3178 unsigned HOST_WIDE_INT access_size_b)
3180 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3181 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3182 poly_uint64 const_length_a;
3183 poly_uint64 const_length_b;
3185 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3186 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3187 [a, a+12) */
3188 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3190 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3191 offset_a = (offset_a + access_size_a) - const_length_a;
3193 else
3194 const_length_a = tree_to_poly_uint64 (segment_length_a);
3195 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3197 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3198 offset_b = (offset_b + access_size_b) - const_length_b;
3200 else
3201 const_length_b = tree_to_poly_uint64 (segment_length_b);
3203 const_length_a += access_size_a;
3204 const_length_b += access_size_b;
3206 if (ranges_known_overlap_p (offset_a, const_length_a,
3207 offset_b, const_length_b))
3208 return 1;
3210 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3211 offset_b, const_length_b))
3212 return 0;
3214 return -1;
3217 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3218 in DDR is >= VF. */
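/* For instance (assumed numbers): with VF 4, a distance-vector entry
   of 5 at LOOP_DEPTH passes the test below, while an entry of 3
   (with the dependence not reversed) makes the function return
   false.  */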
3220 static bool
3221 dependence_distance_ge_vf (data_dependence_relation *ddr,
3222 unsigned int loop_depth, poly_uint64 vf)
3224 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3225 || DDR_NUM_DIST_VECTS (ddr) == 0)
3226 return false;
3228 /* If the dependence is exact, we should have limited the VF instead. */
3229 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3231 unsigned int i;
3232 lambda_vector dist_v;
3233 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3235 HOST_WIDE_INT dist = dist_v[loop_depth];
3236 if (dist != 0
3237 && !(dist > 0 && DDR_REVERSED_P (ddr))
3238 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3239 return false;
3242 if (dump_enabled_p ())
3244 dump_printf_loc (MSG_NOTE, vect_location,
3245 "dependence distance between ");
3246 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3247 dump_printf (MSG_NOTE, " and ");
3248 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3249 dump_printf (MSG_NOTE, " is >= VF\n");
3252 return true;
3255 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3257 static void
3258 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3260 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3261 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3262 dump_printf (dump_kind, ") >= ");
3263 dump_dec (dump_kind, lower_bound.min_value);
3266 /* Record that the vectorized loop requires the vec_lower_bound described
3267 by EXPR, UNSIGNED_P and MIN_VALUE. */
3269 static void
3270 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3271 poly_uint64 min_value)
3273 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3274 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3275 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3277 unsigned_p &= lower_bounds[i].unsigned_p;
3278 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3279 if (lower_bounds[i].unsigned_p != unsigned_p
3280 || maybe_lt (lower_bounds[i].min_value, min_value))
3282 lower_bounds[i].unsigned_p = unsigned_p;
3283 lower_bounds[i].min_value = min_value;
3284 if (dump_enabled_p ())
3286 dump_printf_loc (MSG_NOTE, vect_location,
3287 "updating run-time check to ");
3288 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3289 dump_printf (MSG_NOTE, "\n");
3292 return;
3295 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3296 if (dump_enabled_p ())
3298 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3299 dump_lower_bound (MSG_NOTE, lower_bound);
3300 dump_printf (MSG_NOTE, "\n");
3302 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3305 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3306 will span fewer than GAP bytes. */
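/* Rough illustration (assumed values): with an estimated VF of 4, a
   group size of 2 and 4-byte scalars, the vectorized step covers about
   4 * 2 * 4 == 32 bytes, so any GAP of at most 32 bytes counts as
   small.  */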
3308 static bool
3309 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3310 poly_int64 gap)
3312 stmt_vec_info stmt_info = dr_info->stmt;
3313 HOST_WIDE_INT count
3314 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3315 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3316 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3317 return (estimated_poly_value (gap)
3318 <= count * vect_get_scalar_dr_size (dr_info));
3321 /* Return true if we know that there is no alias between DR_INFO_A and
3322 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3323 When returning true, set *LOWER_BOUND_OUT to this N. */
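/* Example under assumed values: two 4-byte accesses at byte offsets 0
   and 8 from the same base give *LOWER_BOUND_OUT == 8 + 4 - 0 == 12,
   i.e. any step of at least 12 bytes in absolute value keeps the
   accesses from overlapping across iterations.  */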
3325 static bool
3326 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3327 poly_uint64 *lower_bound_out)
3329 /* Check that there is a constant gap of known sign between DR_A
3330 and DR_B. */
3331 data_reference *dr_a = dr_info_a->dr;
3332 data_reference *dr_b = dr_info_b->dr;
3333 poly_int64 init_a, init_b;
3334 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3335 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3336 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3337 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3338 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3339 || !ordered_p (init_a, init_b))
3340 return false;
3342 /* Sort DR_A and DR_B by the address they access. */
3343 if (maybe_lt (init_b, init_a))
3345 std::swap (init_a, init_b);
3346 std::swap (dr_info_a, dr_info_b);
3347 std::swap (dr_a, dr_b);
3350 /* If the two accesses could be dependent within a scalar iteration,
3351 make sure that we'd retain their order. */
3352 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3353 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3354 return false;
3356 /* There is no alias if abs (DR_STEP) is greater than or equal to
3357 the bytes spanned by the combination of the two accesses. */
3358 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3359 return true;
3362 /* Function vect_prune_runtime_alias_test_list.
3364 Prune a list of ddrs to be tested at run-time by versioning for alias.
3365 Merge several alias checks into one if possible.
3366 Return FALSE if the resulting list of ddrs is longer than allowed by
3367 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3369 bool
3370 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3372 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3373 hash_set <tree_pair_hash> compared_objects;
3375 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3376 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3377 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3378 vec<vec_object_pair> &check_unequal_addrs
3379 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3380 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3381 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3383 ddr_p ddr;
3384 unsigned int i;
3385 tree length_factor;
3387 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3389 /* Step values are irrelevant for aliasing if the number of vector
3390 iterations is equal to the number of scalar iterations (which can
3391 happen for fully-SLP loops). */
3392 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3394 if (!ignore_step_p)
3396 /* Convert the checks for nonzero steps into bound tests. */
3397 tree value;
3398 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3399 vect_check_lower_bound (loop_vinfo, value, true, 1);
3402 if (may_alias_ddrs.is_empty ())
3403 return true;
3405 comp_alias_ddrs.create (may_alias_ddrs.length ());
3407 unsigned int loop_depth
3408 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3409 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3411 /* First, we collect all data ref pairs for aliasing checks. */
3412 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3414 int comp_res;
3415 poly_uint64 lower_bound;
3416 tree segment_length_a, segment_length_b;
3417 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3418 unsigned int align_a, align_b;
3420 /* Ignore the alias if the VF we chose ended up being no greater
3421 than the dependence distance. */
3422 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3423 continue;
3425 if (DDR_OBJECT_A (ddr))
3427 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3428 if (!compared_objects.add (new_pair))
3430 if (dump_enabled_p ())
3432 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3433 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3434 dump_printf (MSG_NOTE, " and ");
3435 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3436 dump_printf (MSG_NOTE, " have different addresses\n");
3438 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3440 continue;
3443 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3444 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3446 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3447 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3449 /* Skip the pair if inter-iteration dependencies are irrelevant
3450 and intra-iteration dependencies are guaranteed to be honored. */
3451 if (ignore_step_p
3452 && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3453 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3454 &lower_bound)))
3456 if (dump_enabled_p ())
3458 dump_printf_loc (MSG_NOTE, vect_location,
3459 "no need for alias check between ");
3460 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_a->dr));
3461 dump_printf (MSG_NOTE, " and ");
3462 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_b->dr));
3463 dump_printf (MSG_NOTE, " when VF is 1\n");
3465 continue;
3468 /* See whether we can handle the alias using a bounds check on
3469 the step, and whether that's likely to be the best approach.
3470 (It might not be, for example, if the minimum step is much larger
3471 than the number of bytes handled by one vector iteration.) */
3472 if (!ignore_step_p
3473 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3474 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3475 &lower_bound)
3476 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3477 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3479 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3480 if (dump_enabled_p ())
3482 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3483 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_a->dr));
3484 dump_printf (MSG_NOTE, " and ");
3485 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_b->dr));
3486 dump_printf (MSG_NOTE, " when the step ");
3487 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_info_a->dr));
3488 dump_printf (MSG_NOTE, " is outside ");
3489 if (unsigned_p)
3490 dump_printf (MSG_NOTE, "[0");
3491 else
3493 dump_printf (MSG_NOTE, "(");
3494 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3496 dump_printf (MSG_NOTE, ", ");
3497 dump_dec (MSG_NOTE, lower_bound);
3498 dump_printf (MSG_NOTE, ")\n");
3500 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3501 unsigned_p, lower_bound);
3502 continue;
3505 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3506 if (dr_group_first_a)
3508 stmt_info_a = dr_group_first_a;
3509 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3512 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3513 if (dr_group_first_b)
3515 stmt_info_b = dr_group_first_b;
3516 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3519 if (ignore_step_p)
3521 segment_length_a = size_zero_node;
3522 segment_length_b = size_zero_node;
3524 else
3526 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3527 DR_STEP (dr_info_b->dr), 0))
3528 length_factor = scalar_loop_iters;
3529 else
3530 length_factor = size_int (vect_factor);
3531 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3532 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3534 access_size_a = vect_vfa_access_size (dr_info_a);
3535 access_size_b = vect_vfa_access_size (dr_info_b);
3536 align_a = vect_vfa_align (dr_info_a);
3537 align_b = vect_vfa_align (dr_info_b);
3539 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3540 DR_BASE_ADDRESS (dr_info_b->dr));
3541 if (comp_res == 0)
3542 comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3543 DR_OFFSET (dr_info_b->dr));
3545 /* See whether the alias is known at compilation time. */
3546 if (comp_res == 0
3547 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3548 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3549 && poly_int_tree_p (segment_length_a)
3550 && poly_int_tree_p (segment_length_b))
3552 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3553 segment_length_a,
3554 segment_length_b,
3555 access_size_a,
3556 access_size_b);
3557 if (res >= 0 && dump_enabled_p ())
3559 dump_printf_loc (MSG_NOTE, vect_location,
3560 "can tell at compile time that ");
3561 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_a->dr));
3562 dump_printf (MSG_NOTE, " and ");
3563 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_b->dr));
3564 if (res == 0)
3565 dump_printf (MSG_NOTE, " do not alias\n");
3566 else
3567 dump_printf (MSG_NOTE, " alias\n");
3570 if (res == 0)
3571 continue;
3573 if (res == 1)
3575 if (dump_enabled_p ())
3576 dump_printf_loc (MSG_NOTE, vect_location,
3577 "not vectorized: compilation time alias.\n");
3578 return false;
3582 dr_with_seg_len_pair_t dr_with_seg_len_pair
3583 (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3584 access_size_a, align_a),
3585 dr_with_seg_len (dr_info_b->dr, segment_length_b,
3586 access_size_b, align_b));
3588 /* Canonicalize pairs by sorting the two DR members. */
3589 if (comp_res > 0)
3590 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3592 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3595 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3597 unsigned int count = (comp_alias_ddrs.length ()
3598 + check_unequal_addrs.length ());
3600 dump_printf_loc (MSG_NOTE, vect_location,
3601 "improved number of alias checks from %d to %d\n",
3602 may_alias_ddrs.length (), count);
3603 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3605 if (dump_enabled_p ())
3606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3607 "number of versioning for alias "
3608 "run-time tests exceeds %d "
3609 "(--param vect-max-version-for-alias-checks)\n",
3610 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3611 return false;
3614 return true;
3617 /* Check whether we can use an internal function for a gather load
3618 or scatter store. READ_P is true for loads and false for stores.
3619 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3620 the type of the memory elements being loaded or stored. OFFSET_BITS
3621 is the number of bits in each scalar offset and OFFSET_SIGN is the
3622 sign of the offset. SCALE is the amount by which the offset should
3623 be multiplied *after* it has been converted to address width.
3625 Return true if the function is supported, storing the function
3626 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
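/* For example, if this returns true for an unmasked gather of V4SF
elements with 32-bit integer offsets and SCALE == 4, the vectorizer can
later emit an internal call along the lines of

vect__1 = .GATHER_LOAD (base_ptr, vect_offsets, 4);

where the operand names are hypothetical; this function itself only
checks target support and reports the function id and element type. */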
3628 bool
3629 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3630 tree memory_type, unsigned int offset_bits,
3631 signop offset_sign, int scale,
3632 internal_fn *ifn_out, tree *element_type_out)
3634 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3635 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3636 if (offset_bits > element_bits)
3637 /* Internal functions require the offset to be the same width as
3638 the vector elements. We can extend narrower offsets, but it isn't
3639 safe to truncate wider offsets. */
3640 return false;
3642 if (element_bits != memory_bits)
3643 /* For now the vector elements must be the same width as the
3644 memory elements. */
3645 return false;
3647 /* Work out which function we need. */
3648 internal_fn ifn;
3649 if (read_p)
3650 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3651 else
3652 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3654 /* Test whether the target supports this combination. */
3655 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3656 offset_sign, scale))
3657 return false;
3659 *ifn_out = ifn;
3660 *element_type_out = TREE_TYPE (vectype);
3661 return true;
3664 /* STMT_INFO is a call to an internal gather load or scatter store function.
3665 Describe the operation in INFO. */
3667 static void
3668 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3669 gather_scatter_info *info)
3671 gcall *call = as_a <gcall *> (stmt_info->stmt);
3672 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3673 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3675 info->ifn = gimple_call_internal_fn (call);
3676 info->decl = NULL_TREE;
3677 info->base = gimple_call_arg (call, 0);
3678 info->offset = gimple_call_arg (call, 1);
3679 info->offset_dt = vect_unknown_def_type;
3680 info->offset_vectype = NULL_TREE;
3681 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3682 info->element_type = TREE_TYPE (vectype);
3683 info->memory_type = TREE_TYPE (DR_REF (dr));
3686 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3687 gather load or scatter store. Describe the operation in *INFO if so. */
3689 bool
3690 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3691 gather_scatter_info *info)
3693 HOST_WIDE_INT scale = 1;
3694 poly_int64 pbitpos, pbitsize;
3695 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3696 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3697 tree offtype = NULL_TREE;
3698 tree decl = NULL_TREE, base, off;
3699 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3700 tree memory_type = TREE_TYPE (DR_REF (dr));
3701 machine_mode pmode;
3702 int punsignedp, reversep, pvolatilep = 0;
3703 internal_fn ifn;
3704 tree element_type;
3705 bool masked_p = false;
3707 /* See whether this is already a call to a gather/scatter internal function.
3708 If not, see whether it's a masked load or store. */
3709 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3710 if (call && gimple_call_internal_p (call))
3712 ifn = gimple_call_internal_fn (call);
3713 if (internal_gather_scatter_fn_p (ifn))
3715 vect_describe_gather_scatter_call (stmt_info, info);
3716 return true;
3718 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3721 /* True if we should aim to use internal functions rather than
3722 built-in functions. */
3723 bool use_ifn_p = (DR_IS_READ (dr)
3724 ? supports_vec_gather_load_p ()
3725 : supports_vec_scatter_store_p ());
3727 base = DR_REF (dr);
3728 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3729 see if we can use the def stmt of the address. */
3730 if (masked_p
3731 && TREE_CODE (base) == MEM_REF
3732 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3733 && integer_zerop (TREE_OPERAND (base, 1))
3734 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3736 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3737 if (is_gimple_assign (def_stmt)
3738 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3739 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3742 /* The gather and scatter builtins need an address of the form
3743 loop_invariant + vector * {1, 2, 4, 8}
3744 or
3745 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3746 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3747 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3748 multiplications and additions in it. To get a vector, we need
3749 a single SSA_NAME that will be defined in the loop and will
3750 contain everything that is not loop invariant and that can be
3751 vectorized. The following code attempts to find such a preexisting
3752 SSA_NAME OFF and put the loop invariants into a tree BASE
3753 that can be gimplified before the loop. */
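/* For instance, for a scalar gather such as

for (int i = 0; i < n; i++)
sum += a[idx[i]];

with 4-byte elements, the loop-invariant BASE ends up holding the start
of "a", OFF becomes the SSA_NAME carrying the (sign-extended) value of
idx[i], and SCALE becomes 4. The names "a" and "idx" are illustrative
only. */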
3754 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3755 &punsignedp, &reversep, &pvolatilep);
3756 if (reversep)
3757 return false;
3759 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3761 if (TREE_CODE (base) == MEM_REF)
3763 if (!integer_zerop (TREE_OPERAND (base, 1)))
3765 if (off == NULL_TREE)
3766 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3767 else
3768 off = size_binop (PLUS_EXPR, off,
3769 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3771 base = TREE_OPERAND (base, 0);
3773 else
3774 base = build_fold_addr_expr (base);
3776 if (off == NULL_TREE)
3777 off = size_zero_node;
3779 /* If BASE is not loop invariant, then if OFF is 0 we start with just
3780 the constant offset in the loop-invariant BASE and continue with BASE
3781 as OFF; otherwise give up.
3782 We could handle that case by gimplifying the addition of BASE + OFF
3783 into some SSA_NAME and using that as OFF, but for now punt. */
3784 if (!expr_invariant_in_loop_p (loop, base))
3786 if (!integer_zerop (off))
3787 return false;
3788 off = base;
3789 base = size_int (pbytepos);
3791 /* Otherwise put base + constant offset into the loop invariant BASE
3792 and continue with OFF. */
3793 else
3795 base = fold_convert (sizetype, base);
3796 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3799 /* OFF at this point may be either a SSA_NAME or some tree expression
3800 from get_inner_reference. Try to peel off loop invariants from it
3801 into BASE as long as possible. */
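/* A hypothetical example of the peeling below, assuming a 64-bit
sizetype: if OFF is ((sizetype) j_5 * 4) + 32 with j_5 defined in the
loop, the PLUS_EXPR case moves 32 into BASE, the MULT_EXPR case turns
SCALE into 4 (when the target supports that scale), and the conversion
case records OFFTYPE, leaving OFF as the SSA_NAME j_5. */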
3802 STRIP_NOPS (off);
3803 while (offtype == NULL_TREE)
3805 enum tree_code code;
3806 tree op0, op1, add = NULL_TREE;
3808 if (TREE_CODE (off) == SSA_NAME)
3810 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3812 if (expr_invariant_in_loop_p (loop, off))
3813 return false;
3815 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3816 break;
3818 op0 = gimple_assign_rhs1 (def_stmt);
3819 code = gimple_assign_rhs_code (def_stmt);
3820 op1 = gimple_assign_rhs2 (def_stmt);
3822 else
3824 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3825 return false;
3826 code = TREE_CODE (off);
3827 extract_ops_from_tree (off, &code, &op0, &op1);
3829 switch (code)
3831 case POINTER_PLUS_EXPR:
3832 case PLUS_EXPR:
3833 if (expr_invariant_in_loop_p (loop, op0))
3835 add = op0;
3836 off = op1;
3837 do_add:
3838 add = fold_convert (sizetype, add);
3839 if (scale != 1)
3840 add = size_binop (MULT_EXPR, add, size_int (scale));
3841 base = size_binop (PLUS_EXPR, base, add);
3842 continue;
3844 if (expr_invariant_in_loop_p (loop, op1))
3846 add = op1;
3847 off = op0;
3848 goto do_add;
3850 break;
3851 case MINUS_EXPR:
3852 if (expr_invariant_in_loop_p (loop, op1))
3854 add = fold_convert (sizetype, op1);
3855 add = size_binop (MINUS_EXPR, size_zero_node, add);
3856 off = op0;
3857 goto do_add;
3859 break;
3860 case MULT_EXPR:
3861 if (scale == 1 && tree_fits_shwi_p (op1))
3863 int new_scale = tree_to_shwi (op1);
3864 /* Only treat this as a scaling operation if the target
3865 supports it. */
3866 if (use_ifn_p
3867 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3868 vectype, memory_type, 1,
3869 TYPE_SIGN (TREE_TYPE (op0)),
3870 new_scale, &ifn,
3871 &element_type))
3872 break;
3873 scale = new_scale;
3874 off = op0;
3875 continue;
3877 break;
3878 case SSA_NAME:
3879 off = op0;
3880 continue;
3881 CASE_CONVERT:
3882 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3883 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3884 break;
3885 if (TYPE_PRECISION (TREE_TYPE (op0))
3886 == TYPE_PRECISION (TREE_TYPE (off)))
3888 off = op0;
3889 continue;
3892 /* The internal functions need the offset to be the same width
3893 as the elements of VECTYPE. Don't include operations that
3894 cast the offset from that width to a different width. */
3895 if (use_ifn_p
3896 && (int_size_in_bytes (TREE_TYPE (vectype))
3897 == int_size_in_bytes (TREE_TYPE (off))))
3898 break;
3900 if (TYPE_PRECISION (TREE_TYPE (op0))
3901 < TYPE_PRECISION (TREE_TYPE (off)))
3903 off = op0;
3904 offtype = TREE_TYPE (off);
3905 STRIP_NOPS (off);
3906 continue;
3908 break;
3909 default:
3910 break;
3912 break;
3915 /* If at the end OFF still isn't a SSA_NAME or isn't
3916 defined in the loop, punt. */
3917 if (TREE_CODE (off) != SSA_NAME
3918 || expr_invariant_in_loop_p (loop, off))
3919 return false;
3921 if (offtype == NULL_TREE)
3922 offtype = TREE_TYPE (off);
3924 if (use_ifn_p)
3926 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3927 memory_type, TYPE_PRECISION (offtype),
3928 TYPE_SIGN (offtype), scale, &ifn,
3929 &element_type))
3930 return false;
3932 else
3934 if (DR_IS_READ (dr))
3936 if (targetm.vectorize.builtin_gather)
3937 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3939 else
3941 if (targetm.vectorize.builtin_scatter)
3942 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3945 if (!decl)
3946 return false;
3948 ifn = IFN_LAST;
3949 element_type = TREE_TYPE (vectype);
3952 info->ifn = ifn;
3953 info->decl = decl;
3954 info->base = base;
3955 info->offset = off;
3956 info->offset_dt = vect_unknown_def_type;
3957 info->offset_vectype = NULL_TREE;
3958 info->scale = scale;
3959 info->element_type = element_type;
3960 info->memory_type = memory_type;
3961 return true;
3964 /* Find the data references in STMT, analyze them with respect to LOOP and
3965 append them to DATAREFS. Return false if datarefs in this stmt cannot
3966 be handled. */
3968 bool
3969 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3970 vec<data_reference_p> *datarefs)
3972 /* We can ignore clobbers for dataref analysis - they are removed during
3973 loop vectorization and BB vectorization checks dependences with a
3974 stmt walk. */
3975 if (gimple_clobber_p (stmt))
3976 return true;
3978 if (gimple_has_volatile_ops (stmt))
3980 if (dump_enabled_p ())
3982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3983 "not vectorized: volatile type ");
3984 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3986 return false;
3989 if (stmt_can_throw_internal (stmt))
3991 if (dump_enabled_p ())
3993 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3994 "not vectorized: statement can throw an "
3995 "exception ");
3996 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3998 return false;
4001 auto_vec<data_reference_p, 2> refs;
4002 if (!find_data_references_in_stmt (loop, stmt, &refs))
4003 return false;
4005 if (refs.is_empty ())
4006 return true;
4008 if (refs.length () > 1)
4010 if (dump_enabled_p ())
4012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4013 "not vectorized: more than one data ref "
4014 "in stmt: ");
4015 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4017 return false;
4020 if (gcall *call = dyn_cast <gcall *> (stmt))
4021 if (!gimple_call_internal_p (call)
4022 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4023 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4025 if (dump_enabled_p ())
4027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4028 "not vectorized: dr in a call ");
4029 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4031 return false;
4034 data_reference_p dr = refs.pop ();
4035 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4036 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4038 if (dump_enabled_p ())
4040 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4041 "not vectorized: statement is bitfield "
4042 "access ");
4043 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4045 return false;
4048 if (DR_BASE_ADDRESS (dr)
4049 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4051 if (dump_enabled_p ())
4052 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4053 "not vectorized: base addr of dr is a "
4054 "constant\n");
4055 return false;
4058 /* Check whether this may be a SIMD lane access and adjust the
4059 DR to make it easier for us to handle it. */
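/* This targets the arrays introduced by OpenMP SIMD privatization:
inside a "#pragma omp simd" body a privatized scalar is lowered to an
access like D.priv[_lane], where _lane comes from a call to
.GOMP_SIMD_LANE (simduid). Such a DR initially has a zero step and an
offset of the form _lane * <element size>; the code below rewrites it
to offset 0 and step <element size> and marks it as a SIMD-lane access.
The names D.priv and _lane are illustrative. */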
4060 if (loop
4061 && loop->simduid
4062 && (!DR_BASE_ADDRESS (dr)
4063 || !DR_OFFSET (dr)
4064 || !DR_INIT (dr)
4065 || !DR_STEP (dr)))
4067 struct data_reference *newdr
4068 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4069 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4070 if (DR_BASE_ADDRESS (newdr)
4071 && DR_OFFSET (newdr)
4072 && DR_INIT (newdr)
4073 && DR_STEP (newdr)
4074 && integer_zerop (DR_STEP (newdr)))
4076 tree off = DR_OFFSET (newdr);
4077 STRIP_NOPS (off);
4078 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4079 && TREE_CODE (off) == MULT_EXPR
4080 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4082 tree step = TREE_OPERAND (off, 1);
4083 off = TREE_OPERAND (off, 0);
4084 STRIP_NOPS (off);
4085 if (CONVERT_EXPR_P (off)
4086 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4087 < TYPE_PRECISION (TREE_TYPE (off))))
4088 off = TREE_OPERAND (off, 0);
4089 if (TREE_CODE (off) == SSA_NAME)
4091 gimple *def = SSA_NAME_DEF_STMT (off);
4092 tree reft = TREE_TYPE (DR_REF (newdr));
4093 if (is_gimple_call (def)
4094 && gimple_call_internal_p (def)
4095 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4097 tree arg = gimple_call_arg (def, 0);
4098 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4099 arg = SSA_NAME_VAR (arg);
4100 if (arg == loop->simduid
4101 /* For now. */
4102 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4104 DR_OFFSET (newdr) = ssize_int (0);
4105 DR_STEP (newdr) = step;
4106 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4107 DR_STEP_ALIGNMENT (newdr)
4108 = highest_pow2_factor (step);
4109 /* Mark as simd-lane access. */
4110 newdr->aux = (void *)-1;
4111 free_data_ref (dr);
4112 datarefs->safe_push (newdr);
4113 return true;
4119 free_data_ref (newdr);
4122 datarefs->safe_push (dr);
4123 return true;
4126 /* Function vect_analyze_data_refs.
4128 Find all the data references in the loop or basic block.
4130 The general structure of the analysis of data refs in the vectorizer is as
4131 follows:
4132 1- vect_analyze_data_refs(loop/bb): call
4133 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4134 in the loop/bb and their dependences.
4135 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4136 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4137 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4141 bool
4142 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4144 struct loop *loop = NULL;
4145 unsigned int i;
4146 struct data_reference *dr;
4147 tree scalar_type;
4149 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4151 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4152 loop = LOOP_VINFO_LOOP (loop_vinfo);
4154 /* Go through the data-refs, check that the analysis succeeded. Update
4155 pointer from stmt_vec_info struct to DR and vectype. */
4157 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4158 FOR_EACH_VEC_ELT (datarefs, i, dr)
4160 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4161 poly_uint64 vf;
4163 gcc_assert (DR_REF (dr));
4164 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4165 gcc_assert (!stmt_info->dr_aux.dr);
4166 stmt_info->dr_aux.dr = dr;
4167 stmt_info->dr_aux.stmt = stmt_info;
4169 /* Check that analysis of the data-ref succeeded. */
4170 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4171 || !DR_STEP (dr))
4173 bool maybe_gather
4174 = DR_IS_READ (dr)
4175 && !TREE_THIS_VOLATILE (DR_REF (dr))
4176 && (targetm.vectorize.builtin_gather != NULL
4177 || supports_vec_gather_load_p ());
4178 bool maybe_scatter
4179 = DR_IS_WRITE (dr)
4180 && !TREE_THIS_VOLATILE (DR_REF (dr))
4181 && (targetm.vectorize.builtin_scatter != NULL
4182 || supports_vec_scatter_store_p ());
4184 /* If the target supports vector gather loads or scatter stores,
4185 see if they can be used. */
4186 if (is_a <loop_vec_info> (vinfo)
4187 && !nested_in_vect_loop_p (loop, stmt_info))
4189 if (maybe_gather || maybe_scatter)
4191 if (maybe_gather)
4192 gatherscatter = GATHER;
4193 else
4194 gatherscatter = SCATTER;
4198 if (gatherscatter == SG_NONE)
4200 if (dump_enabled_p ())
4202 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4203 "not vectorized: data ref analysis "
4204 "failed ");
4205 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4206 stmt_info->stmt, 0);
4208 if (is_a <bb_vec_info> (vinfo))
4210 /* In BB vectorization the ref can still participate
4211 in dependence analysis, we just can't vectorize it. */
4212 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4213 continue;
4215 return false;
4219 /* See if this was detected as SIMD lane access. */
4220 if (dr->aux == (void *)-1)
4222 if (nested_in_vect_loop_p (loop, stmt_info))
4224 if (dump_enabled_p ())
4226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4227 "not vectorized: data ref analysis "
4228 "failed ");
4229 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4230 stmt_info->stmt, 0);
4232 return false;
4234 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4237 tree base = get_base_address (DR_REF (dr));
4238 if (base && VAR_P (base) && DECL_NONALIASED (base))
4240 if (dump_enabled_p ())
4242 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4243 "not vectorized: base object not addressable "
4244 "for stmt: ");
4245 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4246 stmt_info->stmt, 0);
4248 if (is_a <bb_vec_info> (vinfo))
4250 /* In BB vectorization the ref can still participate
4251 in dependence analysis, we just can't vectorize it. */
4252 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4253 continue;
4255 return false;
4258 if (is_a <loop_vec_info> (vinfo)
4259 && DR_STEP (dr)
4260 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4262 if (nested_in_vect_loop_p (loop, stmt_info))
4264 if (dump_enabled_p ())
4266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4267 "not vectorized: not suitable for strided "
4268 "load ");
4269 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4270 stmt_info->stmt, 0);
4272 return false;
4274 STMT_VINFO_STRIDED_P (stmt_info) = true;
4277 /* Update DR field in stmt_vec_info struct. */
4279 /* If the dataref is in an inner-loop of the loop that is considered
4280 for vectorization, we also want to analyze the access relative to
4281 the outer-loop (DR contains information only relative to the
4282 inner-most enclosing loop). We do that by building a reference to the
4283 first location accessed by the inner-loop, and analyze it relative to
4284 the outer-loop. */
4285 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4287 /* Build a reference to the first location accessed by the
4288 inner loop: *(BASE + INIT + OFFSET). By construction,
4289 this address must be invariant in the inner loop, so we
4290 can consider it as being used in the outer loop. */
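/* For example, for a dataref like in[i+j] accessed in an inner j-loop
(with "in" pointing to short), the inner-loop DR has, roughly, base
"in", offset i*2 and init 0, so the reference built here is
*(in + i*2 + 0), which is invariant in the j-loop and can be analyzed
with respect to the outer i-loop. */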
4291 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4292 tree offset = unshare_expr (DR_OFFSET (dr));
4293 tree init = unshare_expr (DR_INIT (dr));
4294 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4295 init, offset);
4296 tree init_addr = fold_build_pointer_plus (base, init_offset);
4297 tree init_ref = build_fold_indirect_ref (init_addr);
4299 if (dump_enabled_p ())
4301 dump_printf_loc (MSG_NOTE, vect_location,
4302 "analyze in outer loop: ");
4303 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4304 dump_printf (MSG_NOTE, "\n");
4307 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4308 init_ref, loop))
4309 /* dr_analyze_innermost already explained the failure. */
4310 return false;
4312 if (dump_enabled_p ())
4314 dump_printf_loc (MSG_NOTE, vect_location,
4315 "\touter base_address: ");
4316 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4317 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4318 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4319 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4320 STMT_VINFO_DR_OFFSET (stmt_info));
4321 dump_printf (MSG_NOTE,
4322 "\n\touter constant offset from base address: ");
4323 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4324 STMT_VINFO_DR_INIT (stmt_info));
4325 dump_printf (MSG_NOTE, "\n\touter step: ");
4326 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4327 STMT_VINFO_DR_STEP (stmt_info));
4328 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4329 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4330 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4331 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4332 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4333 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4334 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4335 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4339 /* Set vectype for STMT. */
4340 scalar_type = TREE_TYPE (DR_REF (dr));
4341 STMT_VINFO_VECTYPE (stmt_info)
4342 = get_vectype_for_scalar_type (scalar_type);
4343 if (!STMT_VINFO_VECTYPE (stmt_info))
4345 if (dump_enabled_p ())
4347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4348 "not vectorized: no vectype for stmt: ");
4349 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4350 stmt_info->stmt, 0);
4351 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4352 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4353 scalar_type);
4354 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4357 if (is_a <bb_vec_info> (vinfo))
4359 /* No vector type is fine, the ref can still participate
4360 in dependence analysis, we just can't vectorize it. */
4361 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4362 continue;
4364 return false;
4366 else
4368 if (dump_enabled_p ())
4370 dump_printf_loc (MSG_NOTE, vect_location,
4371 "got vectype for stmt: ");
4372 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
4373 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4374 STMT_VINFO_VECTYPE (stmt_info));
4375 dump_printf (MSG_NOTE, "\n");
4379 /* Adjust the minimal vectorization factor according to the
4380 vector type. */
4381 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4382 *min_vf = upper_bound (*min_vf, vf);
4384 if (gatherscatter != SG_NONE)
4386 gather_scatter_info gs_info;
4387 if (!vect_check_gather_scatter (stmt_info,
4388 as_a <loop_vec_info> (vinfo),
4389 &gs_info)
4390 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4392 if (dump_enabled_p ())
4394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4395 (gatherscatter == GATHER) ?
4396 "not vectorized: not suitable for gather "
4397 "load " :
4398 "not vectorized: not suitable for scatter "
4399 "store ");
4400 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4401 stmt_info->stmt, 0);
4403 return false;
4405 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4409 /* We used to stop processing and prune the list here. Verify we no
4410 longer need to. */
4411 gcc_assert (i == datarefs.length ());
4413 return true;
4417 /* Function vect_get_new_vect_var.
4419 Returns a new temporary variable. The current naming scheme prepends
4420 a prefix chosen by VAR_KIND ("vect", "stmp", "mask" or "vectp") to
4421 the names of vectorizer-generated variables, followed by an underscore
4422 and NAME if NAME is provided. */
4424 tree
4425 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4427 const char *prefix;
4428 tree new_vect_var;
4430 switch (var_kind)
4432 case vect_simple_var:
4433 prefix = "vect";
4434 break;
4435 case vect_scalar_var:
4436 prefix = "stmp";
4437 break;
4438 case vect_mask_var:
4439 prefix = "mask";
4440 break;
4441 case vect_pointer_var:
4442 prefix = "vectp";
4443 break;
4444 default:
4445 gcc_unreachable ();
4448 if (name)
4450 char* tmp = concat (prefix, "_", name, NULL);
4451 new_vect_var = create_tmp_reg (type, tmp);
4452 free (tmp);
4454 else
4455 new_vect_var = create_tmp_reg (type, prefix);
4457 return new_vect_var;
4460 /* Like vect_get_new_vect_var but return an SSA name. */
4462 tree
4463 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4465 const char *prefix;
4466 tree new_vect_var;
4468 switch (var_kind)
4470 case vect_simple_var:
4471 prefix = "vect";
4472 break;
4473 case vect_scalar_var:
4474 prefix = "stmp";
4475 break;
4476 case vect_pointer_var:
4477 prefix = "vectp";
4478 break;
4479 default:
4480 gcc_unreachable ();
4483 if (name)
4485 char* tmp = concat (prefix, "_", name, NULL);
4486 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4487 free (tmp);
4489 else
4490 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4492 return new_vect_var;
4495 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4497 static void
4498 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4500 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4501 int misalign = DR_MISALIGNMENT (dr_info);
4502 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4503 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4504 else
4505 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4506 DR_TARGET_ALIGNMENT (dr_info), misalign);
4509 /* Function vect_create_addr_base_for_vector_ref.
4511 Create an expression that computes the address of the first memory location
4512 that will be accessed for a data reference.
4514 Input:
4515 STMT_INFO: The statement containing the data reference.
4516 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4517 OFFSET: Optional. If supplied, it is added to the initial address.
4518 LOOP: Specify relative to which loop-nest should the address be computed.
4519 For example, when the dataref is in an inner-loop nested in an
4520 outer-loop that is now being vectorized, LOOP can be either the
4521 outer-loop, or the inner-loop. The first memory location accessed
4522 by the following dataref ('in' points to short):
4524 for (i=0; i<N; i++)
4525 for (j=0; j<M; j++)
4526 s += in[i+j]
4528 is as follows:
4529 if LOOP=i_loop: &in (relative to i_loop)
4530 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4531 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4532 initial address. Unlike OFFSET, which is a number of elements to
4533 be added, BYTE_OFFSET is measured in bytes.
4535 Output:
4536 1. Return an SSA_NAME whose value is the address of the memory location of
4537 the first vector of the data reference.
4538 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4539 these statement(s) which define the returned SSA_NAME.
4541 FORNOW: We are only handling array accesses with step 1. */
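/* To make the two offsets concrete (values are illustrative): with the
short array above, OFFSET = 4 advances the returned address by four
elements, i.e. 8 bytes, whereas BYTE_OFFSET = 4 advances it by exactly
4 bytes. */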
4543 tree
4544 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4545 gimple_seq *new_stmt_list,
4546 tree offset,
4547 tree byte_offset)
4549 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4550 struct data_reference *dr = dr_info->dr;
4551 const char *base_name;
4552 tree addr_base;
4553 tree dest;
4554 gimple_seq seq = NULL;
4555 tree vect_ptr_type;
4556 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4557 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4558 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4560 tree data_ref_base = unshare_expr (drb->base_address);
4561 tree base_offset = unshare_expr (drb->offset);
4562 tree init = unshare_expr (drb->init);
4564 if (loop_vinfo)
4565 base_name = get_name (data_ref_base);
4566 else
4568 base_offset = ssize_int (0);
4569 init = ssize_int (0);
4570 base_name = get_name (DR_REF (dr));
4573 /* Create base_offset */
4574 base_offset = size_binop (PLUS_EXPR,
4575 fold_convert (sizetype, base_offset),
4576 fold_convert (sizetype, init));
4578 if (offset)
4580 offset = fold_build2 (MULT_EXPR, sizetype,
4581 fold_convert (sizetype, offset), step);
4582 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4583 base_offset, offset);
4585 if (byte_offset)
4587 byte_offset = fold_convert (sizetype, byte_offset);
4588 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4589 base_offset, byte_offset);
4592 /* base + base_offset */
4593 if (loop_vinfo)
4594 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4595 else
4597 addr_base = build1 (ADDR_EXPR,
4598 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4599 unshare_expr (DR_REF (dr)));
4602 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4603 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4604 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4605 gimple_seq_add_seq (new_stmt_list, seq);
4607 if (DR_PTR_INFO (dr)
4608 && TREE_CODE (addr_base) == SSA_NAME
4609 && !SSA_NAME_PTR_INFO (addr_base))
4611 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4612 if (offset || byte_offset)
4613 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4616 if (dump_enabled_p ())
4618 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4619 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4620 dump_printf (MSG_NOTE, "\n");
4623 return addr_base;
4627 /* Function vect_create_data_ref_ptr.
4629 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4630 location accessed in the loop by STMT_INFO, along with the def-use update
4631 chain to appropriately advance the pointer through the loop iterations.
4632 Also set aliasing information for the pointer. This pointer is used by
4633 the callers to this function to create a memory reference expression for
4634 vector load/store access.
4636 Input:
4637 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4638 GIMPLE_ASSIGN <name, data-ref> or
4639 GIMPLE_ASSIGN <data-ref, name>.
4640 2. AGGR_TYPE: the type of the reference, which should be either a vector
4641 or an array.
4642 3. AT_LOOP: the loop where the vector memref is to be created.
4643 4. OFFSET (optional): an offset to be added to the initial address accessed
4644 by the data-ref in STMT_INFO.
4645 5. BSI: location where the new stmts are to be placed if there is no loop
4646 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4647 pointing to the initial address.
4648 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4649 to the initial address accessed by the data-ref in STMT_INFO. This is
4650 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4651 in bytes.
4652 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4653 to the IV during each iteration of the loop. NULL says to move
4654 by one copy of AGGR_TYPE up or down, depending on the step of the
4655 data reference.
4657 Output:
4658 1. Declare a new ptr to vector_type, and have it point to the base of the
4659 data reference (the initial address accessed by the data reference).
4660 For example, for vector of type V8HI, the following code is generated:
4662 v8hi *ap;
4663 ap = (v8hi *)initial_address;
4665 if OFFSET is not supplied:
4666 initial_address = &a[init];
4667 if OFFSET is supplied:
4668 initial_address = &a[init + OFFSET];
4669 if BYTE_OFFSET is supplied:
4670 initial_address = &a[init] + BYTE_OFFSET;
4672 Return the initial_address in INITIAL_ADDRESS.
4674 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4675 update the pointer in each iteration of the loop.
4677 Return the increment stmt that updates the pointer in PTR_INCR.
4679 3. Set INV_P to true if the access pattern of the data reference in the
4680 vectorized loop is invariant. Set it to false otherwise.
4682 4. Return the pointer. */
4684 tree
4685 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4686 struct loop *at_loop, tree offset,
4687 tree *initial_address, gimple_stmt_iterator *gsi,
4688 gimple **ptr_incr, bool only_init, bool *inv_p,
4689 tree byte_offset, tree iv_step)
4691 const char *base_name;
4692 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4693 struct loop *loop = NULL;
4694 bool nested_in_vect_loop = false;
4695 struct loop *containing_loop = NULL;
4696 tree aggr_ptr_type;
4697 tree aggr_ptr;
4698 tree new_temp;
4699 gimple_seq new_stmt_list = NULL;
4700 edge pe = NULL;
4701 basic_block new_bb;
4702 tree aggr_ptr_init;
4703 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4704 struct data_reference *dr = dr_info->dr;
4705 tree aptr;
4706 gimple_stmt_iterator incr_gsi;
4707 bool insert_after;
4708 tree indx_before_incr, indx_after_incr;
4709 gimple *incr;
4710 tree step;
4711 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4713 gcc_assert (iv_step != NULL_TREE
4714 || TREE_CODE (aggr_type) == ARRAY_TYPE
4715 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4717 if (loop_vinfo)
4719 loop = LOOP_VINFO_LOOP (loop_vinfo);
4720 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4721 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4722 pe = loop_preheader_edge (loop);
4724 else
4726 gcc_assert (bb_vinfo);
4727 only_init = true;
4728 *ptr_incr = NULL;
4731 /* Check the step (evolution) of the load in LOOP, and record
4732 whether it's invariant. */
4733 step = vect_dr_behavior (dr_info)->step;
4734 if (integer_zerop (step))
4735 *inv_p = true;
4736 else
4737 *inv_p = false;
4739 /* Create an expression for the first address accessed by this load
4740 in LOOP. */
4741 base_name = get_name (DR_BASE_ADDRESS (dr));
4743 if (dump_enabled_p ())
4745 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4746 dump_printf_loc (MSG_NOTE, vect_location,
4747 "create %s-pointer variable to type: ",
4748 get_tree_code_name (TREE_CODE (aggr_type)));
4749 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4750 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4751 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4752 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4753 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4754 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4755 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4756 else
4757 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4758 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4759 dump_printf (MSG_NOTE, "\n");
4762 /* (1) Create the new aggregate-pointer variable.
4763 Vector and array types inherit the alias set of their component
4764 type by default, so we need to use a ref-all pointer if the alias set
4765 of the data reference does not conflict with that of the created
4766 aggregate reference (for example because its base object is not addressable). */
4767 bool need_ref_all = false;
4768 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4769 get_alias_set (DR_REF (dr))))
4770 need_ref_all = true;
4771 /* Likewise for any of the data references in the stmt group. */
4772 else if (DR_GROUP_SIZE (stmt_info) > 1)
4774 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4775 do
4777 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4778 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4779 get_alias_set (DR_REF (sdr))))
4781 need_ref_all = true;
4782 break;
4784 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4786 while (sinfo);
4788 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4789 need_ref_all);
4790 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4793 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4794 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4795 def-use update cycles for the pointer: one relative to the outer-loop
4796 (LOOP), which is what steps (3) and (4) below do. The other is relative
4797 to the inner-loop (which is the inner-most loop containing the dataref),
4798 and this is done by step (5) below.
4800 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4801 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4802 redundant. Steps (3),(4) create the following:
4804 vp0 = &base_addr;
4805 LOOP: vp1 = phi(vp0,vp2)
4808 vp2 = vp1 + step
4809 goto LOOP
4811 If there is an inner-loop nested in loop, then step (5) will also be
4812 applied, and an additional update in the inner-loop will be created:
4814 vp0 = &base_addr;
4815 LOOP: vp1 = phi(vp0,vp2)
4817 inner: vp3 = phi(vp1,vp4)
4818 vp4 = vp3 + inner_step
4819 if () goto inner
4821 vp2 = vp1 + step
4822 if () goto LOOP */
4824 /* (2) Calculate the initial address of the aggregate-pointer, and set
4825 the aggregate-pointer to point to it before the loop. */
4827 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4829 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4830 offset, byte_offset);
4831 if (new_stmt_list)
4833 if (pe)
4835 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4836 gcc_assert (!new_bb);
4838 else
4839 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4842 *initial_address = new_temp;
4843 aggr_ptr_init = new_temp;
4845 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4846 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4847 inner-loop nested in LOOP (during outer-loop vectorization). */
4849 /* No update in loop is required. */
4850 if (only_init && (!loop_vinfo || at_loop == loop))
4851 aptr = aggr_ptr_init;
4852 else
4854 if (iv_step == NULL_TREE)
4856 /* The step of the aggregate pointer is the type size. */
4857 iv_step = TYPE_SIZE_UNIT (aggr_type);
4858 /* One exception to the above is when the scalar step of the load in
4859 LOOP is zero. In this case the step here is also zero. */
4860 if (*inv_p)
4861 iv_step = size_zero_node;
4862 else if (tree_int_cst_sgn (step) == -1)
4863 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4866 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4868 create_iv (aggr_ptr_init,
4869 fold_convert (aggr_ptr_type, iv_step),
4870 aggr_ptr, loop, &incr_gsi, insert_after,
4871 &indx_before_incr, &indx_after_incr);
4872 incr = gsi_stmt (incr_gsi);
4873 loop_vinfo->add_stmt (incr);
4875 /* Copy the points-to information if it exists. */
4876 if (DR_PTR_INFO (dr))
4878 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4879 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4881 if (ptr_incr)
4882 *ptr_incr = incr;
4884 aptr = indx_before_incr;
4887 if (!nested_in_vect_loop || only_init)
4888 return aptr;
4891 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4892 nested in LOOP, if it exists. */
4894 gcc_assert (nested_in_vect_loop);
4895 if (!only_init)
4897 standard_iv_increment_position (containing_loop, &incr_gsi,
4898 &insert_after);
4899 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4900 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4901 &indx_after_incr);
4902 incr = gsi_stmt (incr_gsi);
4903 loop_vinfo->add_stmt (incr);
4905 /* Copy the points-to information if it exists. */
4906 if (DR_PTR_INFO (dr))
4908 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4909 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4911 if (ptr_incr)
4912 *ptr_incr = incr;
4914 return indx_before_incr;
4916 else
4917 gcc_unreachable ();
4921 /* Function bump_vector_ptr
4923 Increment a pointer (to a vector type) by vector-size. If requested,
4924 i.e. if PTR-INCR is given, then also connect the new increment stmt
4925 to the existing def-use update-chain of the pointer, by modifying
4926 the PTR_INCR as illustrated below:
4928 The pointer def-use update-chain before this function:
4929 DATAREF_PTR = phi (p_0, p_2)
4930 ....
4931 PTR_INCR: p_2 = DATAREF_PTR + step
4933 The pointer def-use update-chain after this function:
4934 DATAREF_PTR = phi (p_0, p_2)
4935 ....
4936 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4937 ....
4938 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4940 Input:
4941 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4942 in the loop.
4943 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4944 the loop. The increment amount across iterations is expected
4945 to be vector_size.
4946 BSI - location where the new update stmt is to be placed.
4947 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4948 BUMP - optional. The offset by which to bump the pointer. If not given,
4949 the offset is assumed to be vector_size.
4951 Output: Return NEW_DATAREF_PTR as illustrated above.
4955 tree
4956 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4957 stmt_vec_info stmt_info, tree bump)
4959 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4960 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4961 tree update = TYPE_SIZE_UNIT (vectype);
4962 gassign *incr_stmt;
4963 ssa_op_iter iter;
4964 use_operand_p use_p;
4965 tree new_dataref_ptr;
4967 if (bump)
4968 update = bump;
4970 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4971 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4972 else
4973 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4974 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4975 dataref_ptr, update);
4976 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4978 /* Copy the points-to information if it exists. */
4979 if (DR_PTR_INFO (dr))
4981 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4982 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4985 if (!ptr_incr)
4986 return new_dataref_ptr;
4988 /* Update the vector-pointer's cross-iteration increment. */
4989 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4991 tree use = USE_FROM_PTR (use_p);
4993 if (use == dataref_ptr)
4994 SET_USE (use_p, new_dataref_ptr);
4995 else
4996 gcc_assert (operand_equal_p (use, update, 0));
4999 return new_dataref_ptr;
5003 /* Copy memory reference info such as base/clique from the SRC reference
5004 to the DEST MEM_REF. */
5006 void
5007 vect_copy_ref_info (tree dest, tree src)
5009 if (TREE_CODE (dest) != MEM_REF)
5010 return;
5012 tree src_base = src;
5013 while (handled_component_p (src_base))
5014 src_base = TREE_OPERAND (src_base, 0);
5015 if (TREE_CODE (src_base) != MEM_REF
5016 && TREE_CODE (src_base) != TARGET_MEM_REF)
5017 return;
5019 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5020 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5024 /* Function vect_create_destination_var.
5026 Create a new temporary of type VECTYPE. */
5028 tree
5029 vect_create_destination_var (tree scalar_dest, tree vectype)
5031 tree vec_dest;
5032 const char *name;
5033 char *new_name;
5034 tree type;
5035 enum vect_var_kind kind;
5037 kind = vectype
5038 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5039 ? vect_mask_var
5040 : vect_simple_var
5041 : vect_scalar_var;
5042 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5044 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5046 name = get_name (scalar_dest);
5047 if (name)
5048 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5049 else
5050 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5051 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5052 free (new_name);
5054 return vec_dest;
5057 /* Function vect_grouped_store_supported.
5059 Returns TRUE if interleave high and interleave low permutations
5060 are supported, and FALSE otherwise. */
5062 bool
5063 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5065 machine_mode mode = TYPE_MODE (vectype);
5067 /* vect_permute_store_chain requires the group size to be equal to 3 or
5068 be a power of two. */
5069 if (count != 3 && exact_log2 (count) == -1)
5071 if (dump_enabled_p ())
5072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5073 "the size of the group of accesses"
5074 " is not a power of 2 or not eqaul to 3\n");
5075 return false;
5078 /* Check that the permutation is supported. */
5079 if (VECTOR_MODE_P (mode))
5081 unsigned int i;
5082 if (count == 3)
5084 unsigned int j0 = 0, j1 = 0, j2 = 0;
5085 unsigned int i, j;
5087 unsigned int nelt;
5088 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5090 if (dump_enabled_p ())
5091 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5092 "cannot handle groups of 3 stores for"
5093 " variable-length vectors\n");
5094 return false;
5097 vec_perm_builder sel (nelt, nelt, 1);
5098 sel.quick_grow (nelt);
5099 vec_perm_indices indices;
5100 for (j = 0; j < 3; j++)
5102 int nelt0 = ((3 - j) * nelt) % 3;
5103 int nelt1 = ((3 - j) * nelt + 1) % 3;
5104 int nelt2 = ((3 - j) * nelt + 2) % 3;
5105 for (i = 0; i < nelt; i++)
5107 if (3 * i + nelt0 < nelt)
5108 sel[3 * i + nelt0] = j0++;
5109 if (3 * i + nelt1 < nelt)
5110 sel[3 * i + nelt1] = nelt + j1++;
5111 if (3 * i + nelt2 < nelt)
5112 sel[3 * i + nelt2] = 0;
5114 indices.new_vector (sel, 2, nelt);
5115 if (!can_vec_perm_const_p (mode, indices))
5117 if (dump_enabled_p ())
5118 dump_printf (MSG_MISSED_OPTIMIZATION,
5119 "permutation op not supported by target.\n");
5120 return false;
5123 for (i = 0; i < nelt; i++)
5125 if (3 * i + nelt0 < nelt)
5126 sel[3 * i + nelt0] = 3 * i + nelt0;
5127 if (3 * i + nelt1 < nelt)
5128 sel[3 * i + nelt1] = 3 * i + nelt1;
5129 if (3 * i + nelt2 < nelt)
5130 sel[3 * i + nelt2] = nelt + j2++;
5132 indices.new_vector (sel, 2, nelt);
5133 if (!can_vec_perm_const_p (mode, indices))
5135 if (dump_enabled_p ())
5136 dump_printf (MSG_MISSED_OPTIMIZATION,
5137 "permutation op not supported by target.\n");
5138 return false;
5141 return true;
5143 else
5145 /* If length is not equal to 3 then only power of 2 is supported. */
5146 gcc_assert (pow2p_hwi (count));
5147 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5149 /* The encoding has 2 interleaved stepped patterns. */
5150 vec_perm_builder sel (nelt, 2, 3);
5151 sel.quick_grow (6);
5152 for (i = 0; i < 3; i++)
5154 sel[i * 2] = i;
5155 sel[i * 2 + 1] = i + nelt;
5157 vec_perm_indices indices (sel, 2, nelt);
5158 if (can_vec_perm_const_p (mode, indices))
5160 for (i = 0; i < 6; i++)
5161 sel[i] += exact_div (nelt, 2);
5162 indices.new_vector (sel, 2, nelt);
5163 if (can_vec_perm_const_p (mode, indices))
5164 return true;
5169 if (dump_enabled_p ())
5170 dump_printf (MSG_MISSED_OPTIMIZATION,
5171 "permutaion op not supported by target.\n");
5172 return false;
5176 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5177 type VECTYPE. MASKED_P says whether the masked form is needed. */
5179 bool
5180 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5181 bool masked_p)
5183 if (masked_p)
5184 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5185 vec_mask_store_lanes_optab,
5186 vectype, count);
5187 else
5188 return vect_lanes_optab_supported_p ("vec_store_lanes",
5189 vec_store_lanes_optab,
5190 vectype, count);
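/* As an illustration, on targets that provide lane-interleaving stores
(for example AArch64 ST2/ST3/ST4), a group of COUNT == 4 V4SI stores
can be emitted as a single vec_store_lanes operation, avoiding the
explicit permutations built by vect_permute_store_chain below. */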
5194 /* Function vect_permute_store_chain.
5196 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5197 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5198 the data correctly for the stores. Return the final references for stores
5199 in RESULT_CHAIN.
5201 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5202 The input is 4 vectors each containing 8 elements. We assign a number to
5203 each element, the input sequence is:
5205 1st vec: 0 1 2 3 4 5 6 7
5206 2nd vec: 8 9 10 11 12 13 14 15
5207 3rd vec: 16 17 18 19 20 21 22 23
5208 4th vec: 24 25 26 27 28 29 30 31
5210 The output sequence should be:
5212 1st vec: 0 8 16 24 1 9 17 25
5213 2nd vec: 2 10 18 26 3 11 19 27
5214 3rd vec: 4 12 20 28 5 13 21 29
5215 4th vec: 6 14 22 30 7 15 23 31
5217 i.e., we interleave the contents of the four vectors in their order.
5219 We use interleave_high/low instructions to create such output. The input of
5220 each interleave_high/low operation is two vectors:
5221 1st vec 2nd vec
5222 0 1 2 3 4 5 6 7
5223 the even elements of the result vector are obtained left-to-right from the
5224 high/low elements of the first vector. The odd elements of the result are
5225 obtained left-to-right from the high/low elements of the second vector.
5226 The output of interleave_high will be: 0 4 1 5
5227 and of interleave_low: 2 6 3 7
5230 The permutation is done in log LENGTH stages. In each stage interleave_high
5231 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5232 where the first argument is taken from the first half of DR_CHAIN and the
5233 second argument from its second half.
5234 In our example,
5236 I1: interleave_high (1st vec, 3rd vec)
5237 I2: interleave_low (1st vec, 3rd vec)
5238 I3: interleave_high (2nd vec, 4th vec)
5239 I4: interleave_low (2nd vec, 4th vec)
5241 The output for the first stage is:
5243 I1: 0 16 1 17 2 18 3 19
5244 I2: 4 20 5 21 6 22 7 23
5245 I3: 8 24 9 25 10 26 11 27
5246 I4: 12 28 13 29 14 30 15 31
5248 The output of the second stage, i.e. the final result is:
5250 I1: 0 8 16 24 1 9 17 25
5251 I2: 2 10 18 26 3 11 19 27
5252 I3: 4 12 20 28 5 13 21 29
5253 I4: 6 14 22 30 7 15 23 31. */
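/* Concretely, for V8HI (NELT == 8) the two selectors built on the
power-of-two path below are

interleave_high: { 0, 8, 1, 9, 2, 10, 3, 11 }
interleave_low: { 4, 12, 5, 13, 6, 14, 7, 15 }

while the LENGTH == 3 path instead builds the two three-way shuffle
masks element by element. */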
5255 void
5256 vect_permute_store_chain (vec<tree> dr_chain,
5257 unsigned int length,
5258 stmt_vec_info stmt_info,
5259 gimple_stmt_iterator *gsi,
5260 vec<tree> *result_chain)
5262 tree vect1, vect2, high, low;
5263 gimple *perm_stmt;
5264 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5265 tree perm_mask_low, perm_mask_high;
5266 tree data_ref;
5267 tree perm3_mask_low, perm3_mask_high;
5268 unsigned int i, j, n, log_length = exact_log2 (length);
5270 result_chain->quick_grow (length);
5271 memcpy (result_chain->address (), dr_chain.address (),
5272 length * sizeof (tree));
5274 if (length == 3)
5276 /* vect_grouped_store_supported ensures that this is constant. */
5277 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5278 unsigned int j0 = 0, j1 = 0, j2 = 0;
5280 vec_perm_builder sel (nelt, nelt, 1);
5281 sel.quick_grow (nelt);
5282 vec_perm_indices indices;
5283 for (j = 0; j < 3; j++)
5285 int nelt0 = ((3 - j) * nelt) % 3;
5286 int nelt1 = ((3 - j) * nelt + 1) % 3;
5287 int nelt2 = ((3 - j) * nelt + 2) % 3;
5289 for (i = 0; i < nelt; i++)
5291 if (3 * i + nelt0 < nelt)
5292 sel[3 * i + nelt0] = j0++;
5293 if (3 * i + nelt1 < nelt)
5294 sel[3 * i + nelt1] = nelt + j1++;
5295 if (3 * i + nelt2 < nelt)
5296 sel[3 * i + nelt2] = 0;
5298 indices.new_vector (sel, 2, nelt);
5299 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5301 for (i = 0; i < nelt; i++)
5303 if (3 * i + nelt0 < nelt)
5304 sel[3 * i + nelt0] = 3 * i + nelt0;
5305 if (3 * i + nelt1 < nelt)
5306 sel[3 * i + nelt1] = 3 * i + nelt1;
5307 if (3 * i + nelt2 < nelt)
5308 sel[3 * i + nelt2] = nelt + j2++;
5310 indices.new_vector (sel, 2, nelt);
5311 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5313 vect1 = dr_chain[0];
5314 vect2 = dr_chain[1];
5316 /* Create interleaving stmt:
5317 low = VEC_PERM_EXPR <vect1, vect2,
5318 {j, nelt, *, j + 1, nelt + j + 1, *,
5319 j + 2, nelt + j + 2, *, ...}> */
5320 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5321 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5322 vect2, perm3_mask_low);
5323 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5325 vect1 = data_ref;
5326 vect2 = dr_chain[2];
5327 /* Create interleaving stmt:
5328 low = VEC_PERM_EXPR <vect1, vect2,
5329 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5330 6, 7, nelt + j + 2, ...}> */
5331 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5332 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5333 vect2, perm3_mask_high);
5334 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5335 (*result_chain)[j] = data_ref;
5338 else
5340 /* If length is not equal to 3 then only a power of 2 is supported. */
5341 gcc_assert (pow2p_hwi (length));
5343 /* The encoding has 2 interleaved stepped patterns. */
5344 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5345 vec_perm_builder sel (nelt, 2, 3);
5346 sel.quick_grow (6);
5347 for (i = 0; i < 3; i++)
5349 sel[i * 2] = i;
5350 sel[i * 2 + 1] = i + nelt;
5352 vec_perm_indices indices (sel, 2, nelt);
5353 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5355 for (i = 0; i < 6; i++)
5356 sel[i] += exact_div (nelt, 2);
5357 indices.new_vector (sel, 2, nelt);
5358 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5360 for (i = 0, n = log_length; i < n; i++)
5362 for (j = 0; j < length/2; j++)
5364 vect1 = dr_chain[j];
5365 vect2 = dr_chain[j+length/2];
5367 /* Create interleaving stmt:
5368 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5369 ...}> */
5370 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5371 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5372 vect2, perm_mask_high);
5373 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5374 (*result_chain)[2*j] = high;
5376 /* Create interleaving stmt:
5377 low = VEC_PERM_EXPR <vect1, vect2,
5378 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5379 ...}> */
5380 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5381 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5382 vect2, perm_mask_low);
5383 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5384 (*result_chain)[2*j+1] = low;
5386 memcpy (dr_chain.address (), result_chain->address (),
5387 length * sizeof (tree));
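/* Editorial sketch (hedged, illustration only; not part of the
   vectorizer): the power-of-2 branch above encodes just the first three
   index pairs of each permute mask and lets vec_perm_indices extend the
   stepped pattern.  Fully expanded, the masks it relies on are
     perm_mask_high = { 0, nelt, 1, nelt + 1, ... }
     perm_mask_low  = { nelt/2, nelt*3/2, nelt/2 + 1, nelt*3/2 + 1, ... }
   The guarded helper below spells out that expansion for a constant
   NELT; indices 0..nelt-1 select from the first VEC_PERM_EXPR operand,
   nelt..2*nelt-1 from the second.  */
#if 0
static void
example_store_interleave_masks (unsigned int nelt,
				unsigned int *mask_high,
				unsigned int *mask_low)
{
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      /* Interleave the first halves of both inputs.  */
      mask_high[2 * i] = i;
      mask_high[2 * i + 1] = nelt + i;
      /* Interleave the second halves of both inputs.  */
      mask_low[2 * i] = nelt / 2 + i;
      mask_low[2 * i + 1] = nelt + nelt / 2 + i;
    }
}
#endif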
5392 /* Function vect_setup_realignment
5394 This function is called when vectorizing an unaligned load using
5395 the dr_explicit_realign[_optimized] scheme.
5396 This function generates the following code at the loop prolog:
5398 p = initial_addr;
5399 x msq_init = *(floor(p)); # prolog load
5400 realignment_token = call target_builtin;
5401 loop:
5402 x msq = phi (msq_init, ---)
5404 The stmts marked with x are generated only for the case of
5405 dr_explicit_realign_optimized.
5407 The code above sets up a new (vector) pointer, pointing to the first
5408 location accessed by STMT_INFO, and a "floor-aligned" load using that
5409 pointer. It also generates code to compute the "realignment-token"
5410 (if the relevant target hook was defined), and creates a phi-node at the
5411 loop-header bb whose arguments are the result of the prolog-load (created
5412 by this function) and the result of a load that takes place in the loop
5413 (to be created by the caller to this function).
5415 For the case of dr_explicit_realign_optimized:
5416 The caller to this function uses the phi-result (msq) to create the
5417 realignment code inside the loop, and sets up the missing phi argument,
5418 as follows:
5419 loop:
5420 msq = phi (msq_init, lsq)
5421 lsq = *(floor(p')); # load in loop
5422 result = realign_load (msq, lsq, realignment_token);
5424 For the case of dr_explicit_realign:
5425 loop:
5426 msq = *(floor(p)); # load in loop
5427 p' = p + (VS-1);
5428 lsq = *(floor(p')); # load in loop
5429 result = realign_load (msq, lsq, realignment_token);
5431 Input:
5432 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5433 a memory location that may be unaligned.
5434 BSI - place where new code is to be inserted.
5435 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5436 is used.
5438 Output:
5439 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5440 target hook, if defined.
5441 Return value - the result of the loop-header phi node. */
5443 tree
5444 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5445 tree *realignment_token,
5446 enum dr_alignment_support alignment_support_scheme,
5447 tree init_addr,
5448 struct loop **at_loop)
5450 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5451 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5452 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5453 struct data_reference *dr = dr_info->dr;
5454 struct loop *loop = NULL;
5455 edge pe = NULL;
5456 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5457 tree vec_dest;
5458 gimple *inc;
5459 tree ptr;
5460 tree data_ref;
5461 basic_block new_bb;
5462 tree msq_init = NULL_TREE;
5463 tree new_temp;
5464 gphi *phi_stmt;
5465 tree msq = NULL_TREE;
5466 gimple_seq stmts = NULL;
5467 bool inv_p;
5468 bool compute_in_loop = false;
5469 bool nested_in_vect_loop = false;
5470 struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5471 struct loop *loop_for_initial_load = NULL;
5473 if (loop_vinfo)
5475 loop = LOOP_VINFO_LOOP (loop_vinfo);
5476 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5479 gcc_assert (alignment_support_scheme == dr_explicit_realign
5480 || alignment_support_scheme == dr_explicit_realign_optimized);
5482 /* We need to generate three things:
5483 1. the misalignment computation
5484 2. the extra vector load (for the optimized realignment scheme).
5485 3. the phi node for the two vectors from which the realignment is
5486 done (for the optimized realignment scheme). */
5488 /* 1. Determine where to generate the misalignment computation.
5490 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5491 calculation will be generated by this function, outside the loop (in the
5492 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5493 caller, inside the loop.
5495 Background: If the misalignment remains fixed throughout the iterations of
5496 the loop, then both realignment schemes are applicable, and also the
5497 misalignment computation can be done outside LOOP. This is because we are
5498 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5499 are a multiple of VS (the Vector Size), and therefore the misalignment in
5500 different vectorized LOOP iterations is always the same.
5501 The problem arises only if the memory access is in an inner-loop nested
5502 inside LOOP, which is now being vectorized using outer-loop vectorization.
5503 This is the only case when the misalignment of the memory access may not
5504 remain fixed throughout the iterations of the inner-loop (as explained in
5505 detail in vect_supportable_dr_alignment). In this case, not only is the
5506 optimized realignment scheme not applicable, but also the misalignment
5507 computation (and generation of the realignment token that is passed to
5508 REALIGN_LOAD) has to be done inside the loop.
5510 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5511 or not, which in turn determines if the misalignment is computed inside
5512 the inner-loop, or outside LOOP. */
5514 if (init_addr != NULL_TREE || !loop_vinfo)
5516 compute_in_loop = true;
5517 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5521 /* 2. Determine where to generate the extra vector load.
5523 For the optimized realignment scheme, instead of generating two vector
5524 loads in each iteration, we generate a single extra vector load in the
5525 preheader of the loop, and in each iteration reuse the result of the
5526 vector load from the previous iteration. In case the memory access is in
5527 an inner-loop nested inside LOOP, which is now being vectorized using
5528 outer-loop vectorization, we need to determine whether this initial vector
5529 load should be generated at the preheader of the inner-loop, or can be
5530 generated at the preheader of LOOP. If the memory access has no evolution
5531 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5532 to be generated inside LOOP (in the preheader of the inner-loop). */
5534 if (nested_in_vect_loop)
5536 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5537 bool invariant_in_outerloop =
5538 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5539 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5541 else
5542 loop_for_initial_load = loop;
5543 if (at_loop)
5544 *at_loop = loop_for_initial_load;
5546 if (loop_for_initial_load)
5547 pe = loop_preheader_edge (loop_for_initial_load);
5549 /* 3. For the case of the optimized realignment, create the first vector
5550 load at the loop preheader. */
5552 if (alignment_support_scheme == dr_explicit_realign_optimized)
5554 /* Create msq_init = *(floor(p1)) in the loop preheader */
5555 gassign *new_stmt;
5557 gcc_assert (!compute_in_loop);
5558 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5559 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5560 loop_for_initial_load, NULL_TREE,
5561 &init_addr, NULL, &inc, true, &inv_p);
5562 if (TREE_CODE (ptr) == SSA_NAME)
5563 new_temp = copy_ssa_name (ptr);
5564 else
5565 new_temp = make_ssa_name (TREE_TYPE (ptr));
5566 unsigned int align = DR_TARGET_ALIGNMENT (dr_info);
5567 new_stmt = gimple_build_assign
5568 (new_temp, BIT_AND_EXPR, ptr,
5569 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5570 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5571 gcc_assert (!new_bb);
5572 data_ref
5573 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5574 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5575 vect_copy_ref_info (data_ref, DR_REF (dr));
5576 new_stmt = gimple_build_assign (vec_dest, data_ref);
5577 new_temp = make_ssa_name (vec_dest, new_stmt);
5578 gimple_assign_set_lhs (new_stmt, new_temp);
5579 if (pe)
5581 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5582 gcc_assert (!new_bb);
5584 else
5585 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5587 msq_init = gimple_assign_lhs (new_stmt);
5590 /* 4. Create realignment token using a target builtin, if available.
5591 It is done either inside the containing loop, or before LOOP (as
5592 determined above). */
5594 if (targetm.vectorize.builtin_mask_for_load)
5596 gcall *new_stmt;
5597 tree builtin_decl;
5599 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5600 if (!init_addr)
5602 /* Generate the INIT_ADDR computation outside LOOP. */
5603 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5604 NULL_TREE);
5605 if (loop)
5607 pe = loop_preheader_edge (loop);
5608 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5609 gcc_assert (!new_bb);
5611 else
5612 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5615 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5616 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5617 vec_dest =
5618 vect_create_destination_var (scalar_dest,
5619 gimple_call_return_type (new_stmt));
5620 new_temp = make_ssa_name (vec_dest, new_stmt);
5621 gimple_call_set_lhs (new_stmt, new_temp);
5623 if (compute_in_loop)
5624 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5625 else
5627 /* Generate the misalignment computation outside LOOP. */
5628 pe = loop_preheader_edge (loop);
5629 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5630 gcc_assert (!new_bb);
5633 *realignment_token = gimple_call_lhs (new_stmt);
5635 /* The result of the CALL_EXPR to this builtin is determined from
5636 the value of the parameter and no global variables are touched
5637 which makes the builtin a "const" function. Requiring the
5638 builtin to have the "const" attribute makes it unnecessary
5639 to call mark_call_clobbered. */
5640 gcc_assert (TREE_READONLY (builtin_decl));
5643 if (alignment_support_scheme == dr_explicit_realign)
5644 return msq;
5646 gcc_assert (!compute_in_loop);
5647 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5650 /* 5. Create msq = phi <msq_init, lsq> in loop */
5652 pe = loop_preheader_edge (containing_loop);
5653 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5654 msq = make_ssa_name (vec_dest);
5655 phi_stmt = create_phi_node (msq, containing_loop->header);
5656 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5658 return msq;
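/* Editorial sketch (hedged, illustration only): "floor (p)" in the
   comments above is realized by the BIT_AND_EXPR built earlier in this
   function, which masks the pointer down to the target alignment.  The
   guarded helper restates that arithmetic on a plain integer address;
   ALIGN must be a power of two.  */
#if 0
static inline unsigned HOST_WIDE_INT
example_floor_address (unsigned HOST_WIDE_INT addr,
		       unsigned HOST_WIDE_INT align)
{
  /* Equivalent to the generated ptr & -(HOST_WIDE_INT) align.  */
  return addr & -align;
}
#endif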
5662 /* Function vect_grouped_load_supported.
5664 COUNT is the size of the load group (the number of statements plus the
5665 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5666 only one statement, with a gap of COUNT - 1.
5668 Returns true if a suitable permute exists. */
5670 bool
5671 vect_grouped_load_supported (tree vectype, bool single_element_p,
5672 unsigned HOST_WIDE_INT count)
5674 machine_mode mode = TYPE_MODE (vectype);
5676 /* If this is single-element interleaving with an element distance
5677 that leaves unused vector loads around, punt - we at least create
5678 very sub-optimal code in that case (and blow up memory,
5679 see PR65518). */
5680 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5682 if (dump_enabled_p ())
5683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5684 "single-element interleaving not supported "
5685 "for not adjacent vector loads\n");
5686 return false;
5689 /* vect_permute_load_chain requires the group size to be equal to 3 or
5690 be a power of two. */
5691 if (count != 3 && exact_log2 (count) == -1)
5693 if (dump_enabled_p ())
5694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5695 "the size of the group of accesses"
5696 " is not a power of 2 or not equal to 3\n");
5697 return false;
5700 /* Check that the permutation is supported. */
5701 if (VECTOR_MODE_P (mode))
5703 unsigned int i, j;
5704 if (count == 3)
5706 unsigned int nelt;
5707 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5709 if (dump_enabled_p ())
5710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5711 "cannot handle groups of 3 loads for"
5712 " variable-length vectors\n");
5713 return false;
5716 vec_perm_builder sel (nelt, nelt, 1);
5717 sel.quick_grow (nelt);
5718 vec_perm_indices indices;
5719 unsigned int k;
5720 for (k = 0; k < 3; k++)
5722 for (i = 0; i < nelt; i++)
5723 if (3 * i + k < 2 * nelt)
5724 sel[i] = 3 * i + k;
5725 else
5726 sel[i] = 0;
5727 indices.new_vector (sel, 2, nelt);
5728 if (!can_vec_perm_const_p (mode, indices))
5730 if (dump_enabled_p ())
5731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5732 "shuffle of 3 loads is not supported by"
5733 " target\n");
5734 return false;
5736 for (i = 0, j = 0; i < nelt; i++)
5737 if (3 * i + k < 2 * nelt)
5738 sel[i] = i;
5739 else
5740 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5741 indices.new_vector (sel, 2, nelt);
5742 if (!can_vec_perm_const_p (mode, indices))
5744 if (dump_enabled_p ())
5745 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5746 "shuffle of 3 loads is not supported by"
5747 " target\n");
5748 return false;
5751 return true;
5753 else
5755 /* If length is not equal to 3 then only a power of 2 is supported. */
5756 gcc_assert (pow2p_hwi (count));
5757 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5759 /* The encoding has a single stepped pattern. */
5760 vec_perm_builder sel (nelt, 1, 3);
5761 sel.quick_grow (3);
5762 for (i = 0; i < 3; i++)
5763 sel[i] = i * 2;
5764 vec_perm_indices indices (sel, 2, nelt);
5765 if (can_vec_perm_const_p (mode, indices))
5767 for (i = 0; i < 3; i++)
5768 sel[i] = i * 2 + 1;
5769 indices.new_vector (sel, 2, nelt);
5770 if (can_vec_perm_const_p (mode, indices))
5771 return true;
5776 if (dump_enabled_p ())
5777 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5778 "extract even/odd not supported by target\n");
5779 return false;
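/* Editorial sketch (hedged, illustration only): for the power-of-2 case
   above, the two permutations whose support is checked are the full
   even/odd extraction masks
     even: { 0, 2, 4, ..., 2 * nelt - 2 }
     odd:  { 1, 3, 5, ..., 2 * nelt - 1 }
   applied to the 2*nelt-element concatenation of two input vectors.
   The guarded helper spells them out for a constant NELT.  */
#if 0
static void
example_extract_even_odd_masks (unsigned int nelt,
				unsigned int *mask_even,
				unsigned int *mask_odd)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      mask_even[i] = 2 * i;
      mask_odd[i] = 2 * i + 1;
    }
}
#endif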
5782 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5783 type VECTYPE. MASKED_P says whether the masked form is needed. */
5785 bool
5786 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5787 bool masked_p)
5789 if (masked_p)
5790 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5791 vec_mask_load_lanes_optab,
5792 vectype, count);
5793 else
5794 return vect_lanes_optab_supported_p ("vec_load_lanes",
5795 vec_load_lanes_optab,
5796 vectype, count);
5799 /* Function vect_permute_load_chain.
5801 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5802 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5803 the input data correctly. Return the final references for loads in
5804 RESULT_CHAIN.
5806 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5807 The input is 4 vectors each containing 8 elements. We assign a number to each
5808 element; the input sequence is:
5810 1st vec: 0 1 2 3 4 5 6 7
5811 2nd vec: 8 9 10 11 12 13 14 15
5812 3rd vec: 16 17 18 19 20 21 22 23
5813 4th vec: 24 25 26 27 28 29 30 31
5815 The output sequence should be:
5817 1st vec: 0 4 8 12 16 20 24 28
5818 2nd vec: 1 5 9 13 17 21 25 29
5819 3rd vec: 2 6 10 14 18 22 26 30
5820 4th vec: 3 7 11 15 19 23 27 31
5822 i.e., the first output vector should contain the first elements of each
5823 interleaving group, etc.
5825 We use extract_even/odd instructions to create such output. The input of
5826 each extract_even/odd operation is two vectors
5827 1st vec 2nd vec
5828 0 1 2 3 4 5 6 7
5830 and the output is the vector of extracted even/odd elements. The output of
5831 extract_even will be: 0 2 4 6
5832 and of extract_odd: 1 3 5 7
5835 The permutation is done in log LENGTH stages. In each stage extract_even
5836 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5837 their order. In our example,
5839 E1: extract_even (1st vec, 2nd vec)
5840 E2: extract_odd (1st vec, 2nd vec)
5841 E3: extract_even (3rd vec, 4th vec)
5842 E4: extract_odd (3rd vec, 4th vec)
5844 The output for the first stage will be:
5846 E1: 0 2 4 6 8 10 12 14
5847 E2: 1 3 5 7 9 11 13 15
5848 E3: 16 18 20 22 24 26 28 30
5849 E4: 17 19 21 23 25 27 29 31
5851 In order to proceed and create the correct sequence for the next stage (or
5852 for the correct output, if the second stage is the last one, as in our
5853 example), we first put the output of extract_even operation and then the
5854 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5855 The input for the second stage is:
5857 1st vec (E1): 0 2 4 6 8 10 12 14
5858 2nd vec (E3): 16 18 20 22 24 26 28 30
5859 3rd vec (E2): 1 3 5 7 9 11 13 15
5860 4th vec (E4): 17 19 21 23 25 27 29 31
5862 The output of the second stage:
5864 E1: 0 4 8 12 16 20 24 28
5865 E2: 2 6 10 14 18 22 26 30
5866 E3: 1 5 9 13 17 21 25 29
5867 E4: 3 7 11 15 19 23 27 31
5869 And RESULT_CHAIN after reordering:
5871 1st vec (E1): 0 4 8 12 16 20 24 28
5872 2nd vec (E3): 1 5 9 13 17 21 25 29
5873 3rd vec (E2): 2 6 10 14 18 22 26 30
5874 4th vec (E4): 3 7 11 15 19 23 27 31. */
5876 static void
5877 vect_permute_load_chain (vec<tree> dr_chain,
5878 unsigned int length,
5879 stmt_vec_info stmt_info,
5880 gimple_stmt_iterator *gsi,
5881 vec<tree> *result_chain)
5883 tree data_ref, first_vect, second_vect;
5884 tree perm_mask_even, perm_mask_odd;
5885 tree perm3_mask_low, perm3_mask_high;
5886 gimple *perm_stmt;
5887 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5888 unsigned int i, j, log_length = exact_log2 (length);
5890 result_chain->quick_grow (length);
5891 memcpy (result_chain->address (), dr_chain.address (),
5892 length * sizeof (tree));
5894 if (length == 3)
5896 /* vect_grouped_load_supported ensures that this is constant. */
5897 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5898 unsigned int k;
5900 vec_perm_builder sel (nelt, nelt, 1);
5901 sel.quick_grow (nelt);
5902 vec_perm_indices indices;
5903 for (k = 0; k < 3; k++)
5905 for (i = 0; i < nelt; i++)
5906 if (3 * i + k < 2 * nelt)
5907 sel[i] = 3 * i + k;
5908 else
5909 sel[i] = 0;
5910 indices.new_vector (sel, 2, nelt);
5911 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5913 for (i = 0, j = 0; i < nelt; i++)
5914 if (3 * i + k < 2 * nelt)
5915 sel[i] = i;
5916 else
5917 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5918 indices.new_vector (sel, 2, nelt);
5919 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5921 first_vect = dr_chain[0];
5922 second_vect = dr_chain[1];
5924 /* Create interleaving stmt (low part of):
5925 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5926 ...}> */
5927 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5928 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5929 second_vect, perm3_mask_low);
5930 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5932 /* Create interleaving stmt (high part of):
5933 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5934 ...}> */
5935 first_vect = data_ref;
5936 second_vect = dr_chain[2];
5937 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5938 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5939 second_vect, perm3_mask_high);
5940 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5941 (*result_chain)[k] = data_ref;
5944 else
5946 /* If length is not equal to 3 then only a power of 2 is supported. */
5947 gcc_assert (pow2p_hwi (length));
5949 /* The encoding has a single stepped pattern. */
5950 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5951 vec_perm_builder sel (nelt, 1, 3);
5952 sel.quick_grow (3);
5953 for (i = 0; i < 3; ++i)
5954 sel[i] = i * 2;
5955 vec_perm_indices indices (sel, 2, nelt);
5956 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5958 for (i = 0; i < 3; ++i)
5959 sel[i] = i * 2 + 1;
5960 indices.new_vector (sel, 2, nelt);
5961 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5963 for (i = 0; i < log_length; i++)
5965 for (j = 0; j < length; j += 2)
5967 first_vect = dr_chain[j];
5968 second_vect = dr_chain[j+1];
5970 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5971 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5972 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5973 first_vect, second_vect,
5974 perm_mask_even);
5975 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5976 (*result_chain)[j/2] = data_ref;
5978 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5979 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5980 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5981 first_vect, second_vect,
5982 perm_mask_odd);
5983 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5984 (*result_chain)[j/2+length/2] = data_ref;
5986 memcpy (dr_chain.address (), result_chain->address (),
5987 length * sizeof (tree));
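/* Editorial sketch (hedged, not vectorizer code): the power-of-2 branch
   above can be restated on plain arrays.  Using the sizes from the
   function comment (LENGTH 4, vectors of 8 elements), each stage
   extracts the even and the odd elements of the concatenation of every
   vector pair; after log2 (LENGTH) stages the chain is de-interleaved
   exactly as described above.  Guarded out, illustration only.  */
#if 0
static void
example_permute_load_chain (int chain[4][8])
{
  const unsigned int length = 4, nelt = 8;
  int result[4][8];
  for (unsigned int stage = 0; stage < 2 /* log2 (length) */; stage++)
    {
      for (unsigned int j = 0; j < length; j += 2)
	for (unsigned int i = 0; i < nelt / 2; i++)
	  {
	    /* Even elements of the concatenation go to slot j/2 ...  */
	    result[j / 2][i] = chain[j][2 * i];
	    result[j / 2][nelt / 2 + i] = chain[j + 1][2 * i];
	    /* ... and odd elements to slot j/2 + length/2.  */
	    result[j / 2 + length / 2][i] = chain[j][2 * i + 1];
	    result[j / 2 + length / 2][nelt / 2 + i] = chain[j + 1][2 * i + 1];
	  }
      memcpy (chain, result, sizeof (result));
    }
}
#endif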
5992 /* Function vect_shift_permute_load_chain.
5994 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5995 a sequence of stmts to reorder the input data accordingly.
5996 Return the final references for loads in RESULT_CHAIN.
5997 Return true if successful, false otherwise.
5999 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6000 The input is 3 vectors each containing 8 elements. We assign a
6001 number to each element; the input sequence is:
6003 1st vec: 0 1 2 3 4 5 6 7
6004 2nd vec: 8 9 10 11 12 13 14 15
6005 3rd vec: 16 17 18 19 20 21 22 23
6007 The output sequence should be:
6009 1st vec: 0 3 6 9 12 15 18 21
6010 2nd vec: 1 4 7 10 13 16 19 22
6011 3rd vec: 2 5 8 11 14 17 20 23
6013 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6015 First we shuffle all 3 vectors to get correct elements order:
6017 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6018 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6019 3rd vec: (16 19 22) (17 20 23) (18 21)
6021 Next we unite and shift vector 3 times:
6023 1st step:
6024 shift right by 6 the concatenation of:
6025 "1st vec" and "2nd vec"
6026 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6027 "2nd vec" and "3rd vec"
6028 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6029 "3rd vec" and "1st vec"
6030 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6031 | New vectors |
6033 So that now new vectors are:
6035 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6036 2nd vec: (10 13) (16 19 22) (17 20 23)
6037 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6039 2nd step:
6040 shift right by 5 the concatenation of:
6041 "1st vec" and "3rd vec"
6042 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6043 "2nd vec" and "1st vec"
6044 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6045 "3rd vec" and "2nd vec"
6046 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6047 | New vectors |
6049 So that now new vectors are:
6051 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6052 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6053 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6055 3rd step:
6056 shift right by 5 the concatenation of:
6057 "1st vec" and "1st vec"
6058 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6059 shift right by 3 the concatenation of:
6060 "2nd vec" and "2nd vec"
6061 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6062 | New vectors |
6064 So that now all vectors are READY:
6065 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6066 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6067 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6069 This algorithm is faster than one in vect_permute_load_chain if:
6070 1. "shift of a concatination" is faster than general permutation.
6071 This is usually so.
6072 2. The TARGET machine can't execute vector instructions in parallel.
6073 This is because each step of the algorithm depends on the previous one.
6074 The algorithm in vect_permute_load_chain is much more parallel.
6076 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6079 static bool
6080 vect_shift_permute_load_chain (vec<tree> dr_chain,
6081 unsigned int length,
6082 stmt_vec_info stmt_info,
6083 gimple_stmt_iterator *gsi,
6084 vec<tree> *result_chain)
6086 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6087 tree perm2_mask1, perm2_mask2, perm3_mask;
6088 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6089 gimple *perm_stmt;
6091 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6092 unsigned int i;
6093 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6095 unsigned HOST_WIDE_INT nelt, vf;
6096 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6097 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6098 /* Not supported for variable-length vectors. */
6099 return false;
6101 vec_perm_builder sel (nelt, nelt, 1);
6102 sel.quick_grow (nelt);
6104 result_chain->quick_grow (length);
6105 memcpy (result_chain->address (), dr_chain.address (),
6106 length * sizeof (tree));
6108 if (pow2p_hwi (length) && vf > 4)
6110 unsigned int j, log_length = exact_log2 (length);
6111 for (i = 0; i < nelt / 2; ++i)
6112 sel[i] = i * 2;
6113 for (i = 0; i < nelt / 2; ++i)
6114 sel[nelt / 2 + i] = i * 2 + 1;
6115 vec_perm_indices indices (sel, 2, nelt);
6116 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6118 if (dump_enabled_p ())
6119 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6120 "shuffle of 2 fields structure is not \
6121 supported by target\n");
6122 return false;
6124 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6126 for (i = 0; i < nelt / 2; ++i)
6127 sel[i] = i * 2 + 1;
6128 for (i = 0; i < nelt / 2; ++i)
6129 sel[nelt / 2 + i] = i * 2;
6130 indices.new_vector (sel, 2, nelt);
6131 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6133 if (dump_enabled_p ())
6134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6135 "shuffle of 2 fields structure is not \
6136 supported by target\n");
6137 return false;
6139 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6141 /* Generating permutation constant to shift all elements.
6142 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6143 for (i = 0; i < nelt; i++)
6144 sel[i] = nelt / 2 + i;
6145 indices.new_vector (sel, 2, nelt);
6146 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6148 if (dump_enabled_p ())
6149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6150 "shift permutation is not supported by target\n");
6151 return false;
6153 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6155 /* Generating permutation constant to select vector from 2.
6156 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6157 for (i = 0; i < nelt / 2; i++)
6158 sel[i] = i;
6159 for (i = nelt / 2; i < nelt; i++)
6160 sel[i] = nelt + i;
6161 indices.new_vector (sel, 2, nelt);
6162 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6164 if (dump_enabled_p ())
6165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6166 "select is not supported by target\n");
6167 return false;
6169 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6171 for (i = 0; i < log_length; i++)
6173 for (j = 0; j < length; j += 2)
6175 first_vect = dr_chain[j];
6176 second_vect = dr_chain[j + 1];
6178 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6179 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6180 first_vect, first_vect,
6181 perm2_mask1);
6182 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6183 vect[0] = data_ref;
6185 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6186 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6187 second_vect, second_vect,
6188 perm2_mask2);
6189 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6190 vect[1] = data_ref;
6192 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6193 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6194 vect[0], vect[1], shift1_mask);
6195 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6196 (*result_chain)[j/2 + length/2] = data_ref;
6198 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6199 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6200 vect[0], vect[1], select_mask);
6201 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6202 (*result_chain)[j/2] = data_ref;
6204 memcpy (dr_chain.address (), result_chain->address (),
6205 length * sizeof (tree));
6207 return true;
6209 if (length == 3 && vf > 2)
6211 unsigned int k = 0, l = 0;
6213 /* Generating permutation constant to get all elements in right order.
6214 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6215 for (i = 0; i < nelt; i++)
6217 if (3 * k + (l % 3) >= nelt)
6219 k = 0;
6220 l += (3 - (nelt % 3));
6222 sel[i] = 3 * k + (l % 3);
6223 k++;
6225 vec_perm_indices indices (sel, 2, nelt);
6226 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6228 if (dump_enabled_p ())
6229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6230 "shuffle of 3 fields structure is not \
6231 supported by target\n");
6232 return false;
6234 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6236 /* Generating permutation constant to shift all elements.
6237 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6238 for (i = 0; i < nelt; i++)
6239 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6240 indices.new_vector (sel, 2, nelt);
6241 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6243 if (dump_enabled_p ())
6244 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6245 "shift permutation is not supported by target\n");
6246 return false;
6248 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6250 /* Generating permutation constant to shift all elements.
6251 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6252 for (i = 0; i < nelt; i++)
6253 sel[i] = 2 * (nelt / 3) + 1 + i;
6254 indices.new_vector (sel, 2, nelt);
6255 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6257 if (dump_enabled_p ())
6258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6259 "shift permutation is not supported by target\n");
6260 return false;
6262 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6264 /* Generating permutation constant to shift all elements.
6265 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6266 for (i = 0; i < nelt; i++)
6267 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6268 indices.new_vector (sel, 2, nelt);
6269 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6271 if (dump_enabled_p ())
6272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6273 "shift permutation is not supported by target\n");
6274 return false;
6276 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6278 /* Generating permutation constant to shift all elements.
6279 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6280 for (i = 0; i < nelt; i++)
6281 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6282 indices.new_vector (sel, 2, nelt);
6283 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6285 if (dump_enabled_p ())
6286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6287 "shift permutation is not supported by target\n");
6288 return false;
6290 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6292 for (k = 0; k < 3; k++)
6294 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6295 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6296 dr_chain[k], dr_chain[k],
6297 perm3_mask);
6298 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6299 vect[k] = data_ref;
6302 for (k = 0; k < 3; k++)
6304 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6305 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6306 vect[k % 3], vect[(k + 1) % 3],
6307 shift1_mask);
6308 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6309 vect_shift[k] = data_ref;
6312 for (k = 0; k < 3; k++)
6314 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6315 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6316 vect_shift[(4 - k) % 3],
6317 vect_shift[(3 - k) % 3],
6318 shift2_mask);
6319 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6320 vect[k] = data_ref;
6323 (*result_chain)[3 - (nelt % 3)] = vect[2];
6325 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6326 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6327 vect[0], shift3_mask);
6328 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6329 (*result_chain)[nelt % 3] = data_ref;
6331 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6332 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6333 vect[1], shift4_mask);
6334 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6335 (*result_chain)[0] = data_ref;
6336 return true;
6338 return false;
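/* Editorial sketch (hedged, illustration only): every "shift right by S
   of a concatenation" step described above is a single VEC_PERM_EXPR
   whose mask is the consecutive index run { S, S + 1, ..., S + nelt - 1 }
   into the 2*nelt-element concatenation of the two operands.  The
   guarded helper builds such a mask; e.g. nelt = 8, shift = 6 gives
   { 6, 7, ..., 13 }, matching the first shift in the function comment.  */
#if 0
static void
example_shift_mask (unsigned int nelt, unsigned int shift,
		    unsigned int *mask)
{
  for (unsigned int i = 0; i < nelt; i++)
    mask[i] = shift + i;
}
#endif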
6341 /* Function vect_transform_grouped_load.
6343 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6344 to perform their permutation and ascribe the result vectorized statements to
6345 the scalar statements.
6348 void
6349 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6350 int size, gimple_stmt_iterator *gsi)
6352 machine_mode mode;
6353 vec<tree> result_chain = vNULL;
6355 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6356 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6357 vectors, that are ready for vector computation. */
6358 result_chain.create (size);
6360 /* If the reassociation width for the vector type is 2 or greater, the target
6361 machine can execute 2 or more vector instructions in parallel. Otherwise try
6362 to get the chain for the load group using vect_shift_permute_load_chain. */
6363 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6364 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6365 || pow2p_hwi (size)
6366 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6367 gsi, &result_chain))
6368 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6369 vect_record_grouped_load_vectors (stmt_info, result_chain);
6370 result_chain.release ();
6373 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6374 generated as part of the vectorization of STMT_INFO. Assign the statement
6375 for each vector to the associated scalar statement. */
6377 void
6378 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6379 vec<tree> result_chain)
6381 vec_info *vinfo = stmt_info->vinfo;
6382 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6383 unsigned int i, gap_count;
6384 tree tmp_data_ref;
6386 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6387 Since we scan the chain starting from its first node, their order
6388 corresponds to the order of data-refs in RESULT_CHAIN. */
6389 stmt_vec_info next_stmt_info = first_stmt_info;
6390 gap_count = 1;
6391 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6393 if (!next_stmt_info)
6394 break;
6396 /* Skip the gaps. Loads created for the gaps will be removed by dead
6397 code elimination pass later. No need to check for the first stmt in
6398 the group, since it always exists.
6399 DR_GROUP_GAP is the number of steps in elements from the previous
6400 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6401 correspond to the gaps. */
6402 if (next_stmt_info != first_stmt_info
6403 && gap_count < DR_GROUP_GAP (next_stmt_info))
6405 gap_count++;
6406 continue;
6409 while (next_stmt_info)
6411 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6412 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6413 copies, and we put the new vector statement in the first available
6414 RELATED_STMT. */
6415 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6416 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6417 else
6419 if (!DR_GROUP_SAME_DR_STMT (next_stmt_info))
6421 stmt_vec_info prev_stmt_info
6422 = STMT_VINFO_VEC_STMT (next_stmt_info);
6423 stmt_vec_info rel_stmt_info
6424 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6425 while (rel_stmt_info)
6427 prev_stmt_info = rel_stmt_info;
6428 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6431 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6435 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6436 gap_count = 1;
6437 /* If NEXT_STMT_INFO accesses the same DR as the previous statement,
6438 put the same TMP_DATA_REF as its vectorized statement; otherwise
6439 get the next data-ref from RESULT_CHAIN. */
6440 if (!next_stmt_info || !DR_GROUP_SAME_DR_STMT (next_stmt_info))
6441 break;
6446 /* Function vect_can_force_dr_alignment_p.
6448 Returns whether the alignment of a DECL can be forced to be aligned
6449 on ALIGNMENT bit boundary. */
6451 bool
6452 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6454 if (!VAR_P (decl))
6455 return false;
6457 if (decl_in_symtab_p (decl)
6458 && !symtab_node::get (decl)->can_increase_alignment_p ())
6459 return false;
6461 if (TREE_STATIC (decl))
6462 return (alignment <= MAX_OFILE_ALIGNMENT);
6463 else
6464 return (alignment <= MAX_STACK_ALIGNMENT);
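/* Editorial usage sketch (hedged, not taken from this file): a caller
   that wants to raise the alignment of a vectorized array DECL would
   typically guard the change with the predicate above.  ALIGN_IN_BITS
   is an illustrative name; SET_DECL_ALIGN is the existing tree
   accessor.  */
#if 0
static void
example_force_decl_alignment (tree decl, unsigned int align_in_bits)
{
  if (vect_can_force_dr_alignment_p (decl, align_in_bits))
    SET_DECL_ALIGN (decl, align_in_bits);
}
#endif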
6468 /* Return whether the data reference DR_INFO is supported with respect to its
6469 alignment.
6470 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6471 it is aligned, i.e., check if it is possible to vectorize it with different
6472 alignment. */
6474 enum dr_alignment_support
6475 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6476 bool check_aligned_accesses)
6478 data_reference *dr = dr_info->dr;
6479 stmt_vec_info stmt_info = dr_info->stmt;
6480 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6481 machine_mode mode = TYPE_MODE (vectype);
6482 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6483 struct loop *vect_loop = NULL;
6484 bool nested_in_vect_loop = false;
6486 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6487 return dr_aligned;
6489 /* For now assume all conditional loads/stores support unaligned
6490 access without any special code. */
6491 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6492 if (gimple_call_internal_p (stmt)
6493 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6494 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6495 return dr_unaligned_supported;
6497 if (loop_vinfo)
6499 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6500 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6503 /* Possibly unaligned access. */
6505 /* We can choose between using the implicit realignment scheme (generating
6506 a misaligned_move stmt) and the explicit realignment scheme (generating
6507 aligned loads with a REALIGN_LOAD). There are two variants to the
6508 explicit realignment scheme: optimized, and unoptimized.
6509 We can optimize the realignment only if the step between consecutive
6510 vector loads is equal to the vector size. Since the vector memory
6511 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6512 is guaranteed that the misalignment amount remains the same throughout the
6513 execution of the vectorized loop. Therefore, we can create the
6514 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6515 at the loop preheader.
6517 However, in the case of outer-loop vectorization, when vectorizing a
6518 memory access in the inner-loop nested within the LOOP that is now being
6519 vectorized, while it is guaranteed that the misalignment of the
6520 vectorized memory access will remain the same in different outer-loop
6521 iterations, it is *not* guaranteed that it will remain the same throughout
6522 the execution of the inner-loop. This is because the inner-loop advances
6523 with the original scalar step (and not in steps of VS). If the inner-loop
6524 step happens to be a multiple of VS, then the misalignment remains fixed
6525 and we can use the optimized realignment scheme. For example:
6527 for (i=0; i<N; i++)
6528 for (j=0; j<M; j++)
6529 s += a[i+j];
6531 When vectorizing the i-loop in the above example, the step between
6532 consecutive vector loads is 1, and so the misalignment does not remain
6533 fixed across the execution of the inner-loop, and the realignment cannot
6534 be optimized (as illustrated in the following pseudo vectorized loop):
6536 for (i=0; i<N; i+=4)
6537 for (j=0; j<M; j++){
6538 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6539 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6540 // (assuming that we start from an aligned address).
6543 We therefore have to use the unoptimized realignment scheme:
6545 for (i=0; i<N; i+=4)
6546 for (j=k; j<M; j+=4)
6547 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6548 // that the misalignment of the initial address is
6549 // 0).
6551 The loop can then be vectorized as follows:
6553 for (k=0; k<4; k++){
6554 rt = get_realignment_token (&vp[k]);
6555 for (i=0; i<N; i+=4){
6556 v1 = vp[i+k];
6557 for (j=k; j<M; j+=4){
6558 v2 = vp[i+j+VS-1];
6559 va = REALIGN_LOAD <v1,v2,rt>;
6560 vs += va;
6561 v1 = v2;
6564 } */
6566 if (DR_IS_READ (dr))
6568 bool is_packed = false;
6569 tree type = (TREE_TYPE (DR_REF (dr)));
6571 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6572 && (!targetm.vectorize.builtin_mask_for_load
6573 || targetm.vectorize.builtin_mask_for_load ()))
6575 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6577 /* If we are doing SLP then the accesses need not have the
6578 same alignment, instead it depends on the SLP group size. */
6579 if (loop_vinfo
6580 && STMT_SLP_TYPE (stmt_info)
6581 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6582 * (DR_GROUP_SIZE
6583 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6584 TYPE_VECTOR_SUBPARTS (vectype)))
6586 else if (!loop_vinfo
6587 || (nested_in_vect_loop
6588 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6589 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6590 return dr_explicit_realign;
6591 else
6592 return dr_explicit_realign_optimized;
6594 if (!known_alignment_for_access_p (dr_info))
6595 is_packed = not_size_aligned (DR_REF (dr));
6597 if (targetm.vectorize.support_vector_misalignment
6598 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6599 /* Can't software pipeline the loads, but can at least do them. */
6600 return dr_unaligned_supported;
6602 else
6604 bool is_packed = false;
6605 tree type = (TREE_TYPE (DR_REF (dr)));
6607 if (!known_alignment_for_access_p (dr_info))
6608 is_packed = not_size_aligned (DR_REF (dr));
6610 if (targetm.vectorize.support_vector_misalignment
6611 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6612 return dr_unaligned_supported;
6615 /* Unsupported. */
6616 return dr_unaligned_unsupported;
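/* Editorial sketch (hedged): callers generally dispatch on the value
   returned above to choose an access strategy.  The guarded helper is
   illustrative only and is not how the vectorizer itself is written.  */
#if 0
static bool
example_pick_alignment_scheme (dr_vec_info *dr_info)
{
  switch (vect_supportable_dr_alignment (dr_info, false))
    {
    case dr_aligned:
    case dr_unaligned_supported:
      /* An ordinary (possibly misaligned) vector access will do.  */
      return true;
    case dr_explicit_realign:
    case dr_explicit_realign_optimized:
      /* Use aligned loads plus REALIGN_LOAD; see vect_setup_realignment.  */
      return true;
    default:
      /* dr_unaligned_unsupported: the access cannot be vectorized.  */
      return false;
    }
}
#endif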