1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
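/* As an illustration of the promotion case described above (a hypothetical
   loop; the names s, d and n are made up):

     short *s;  int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i];

   The result type of the statement is "int", but the smallest scalar type
   is "short" (the rhs of the promotion), so the short accesses are what
   determine the VF.  */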
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
120 HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt_info->stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
134 if (assign
135 && (gimple_assign_cast_p (assign)
136 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
140 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
142 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
144 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
145 if (rhs < lhs)
146 scalar_type = rhs_type;
149 *lhs_size_unit = lhs;
150 *rhs_size_unit = rhs;
151 return scalar_type;
155 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
156 tested at run-time. Return TRUE if DDR was successfully inserted.
157 Return false if versioning is not supported. */
159 static opt_result
160 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
162 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
164 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
165 return opt_result::failure_at (vect_location,
166 "will not create alias checks, as"
167 " --param vect-max-version-for-alias-checks"
168 " == 0\n");
170 opt_result res
171 = runtime_alias_check_p (ddr, loop,
172 optimize_loop_nest_for_speed_p (loop));
173 if (!res)
174 return res;
176 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
177 return opt_result::success ();
180 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
182 static void
183 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
185 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
186 for (unsigned int i = 0; i < checks.length(); ++i)
187 if (checks[i] == value)
188 return;
190 if (dump_enabled_p ())
191 dump_printf_loc (MSG_NOTE, vect_location,
192 "need run-time check that %T is nonzero\n",
193 value);
194 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
197 /* Return true if we know that the order of vectorized DR_INFO_A and
198 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
199 DR_INFO_B. At least one of the accesses is a write. */
201 static bool
202 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
204 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
205 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
207 /* Single statements are always kept in their original order. */
208 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
209 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
210 return true;
212 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
213 group are emitted at the position of the last scalar load and all
214 stores in a group are emitted at the position of the last scalar store.
215 Compute that position and check whether the resulting order matches
216 the current one. */
217 stmt_vec_info last_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
218 if (last_a)
219 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
220 s = DR_GROUP_NEXT_ELEMENT (s))
221 last_a = get_later_stmt (last_a, s);
222 else
223 last_a = stmtinfo_a;
224 stmt_vec_info last_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
225 if (last_b)
226 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
227 s = DR_GROUP_NEXT_ELEMENT (s))
228 last_b = get_later_stmt (last_b, s);
229 else
230 last_b = stmtinfo_b;
231 return ((get_later_stmt (last_a, last_b) == last_a)
232 == (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a));
235 /* A subroutine of vect_analyze_data_ref_dependence. Handle
236 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
237 distances. These distances are conservatively correct but they don't
238 reflect a guaranteed dependence.
240 Return true if this function does all the work necessary to avoid
241 an alias or false if the caller should use the dependence distances
242 to limit the vectorization factor in the usual way. LOOP_DEPTH is
243 the depth of the loop described by LOOP_VINFO and the other arguments
244 are as for vect_analyze_data_ref_dependence. */
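/* As a hedged illustration of the "conservatively correct distance" case
   (hypothetical code; the names f, p, q and n are made up):

     void f (int *p, int *q, int n)
     {
       for (int i = 0; i < n; i++)
         p[i] = q[i + 2] + 1;
     }

   Dependence analysis may record a distance of 2, computed as if P and Q
   referred to the same object, yet if they point to unrelated objects there
   is no dependence at all.  That is exactly the situation the runtime alias
   check requested below is allowed to exploit.  */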
246 static bool
247 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
248 loop_vec_info loop_vinfo,
249 int loop_depth, unsigned int *max_vf)
251 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
252 lambda_vector dist_v;
253 unsigned int i;
254 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
256 int dist = dist_v[loop_depth];
257 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
259 /* If the user asserted safelen >= DIST consecutive iterations
260 can be executed concurrently, assume independence.
262 ??? An alternative would be to add the alias check even
263 in this case, and vectorize the fallback loop with the
264 maximum VF set to safelen. However, if the user has
265 explicitly given a length, it's less likely that that
266 would be a win. */
267 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
269 if ((unsigned int) loop->safelen < *max_vf)
270 *max_vf = loop->safelen;
271 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
272 continue;
275 /* For dependence distances of 2 or more, we have the option
276 of limiting VF or checking for an alias at runtime.
277 Prefer to check at runtime if we can, to avoid limiting
278 the VF unnecessarily when the bases are in fact independent.
280 Note that the alias checks will be removed if the VF ends up
281 being small enough. */
282 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
283 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
284 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
285 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
286 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
289 return true;
293 /* Function vect_analyze_data_ref_dependence.
295 FIXME: I needed to change the sense of the returned flag.
297 Return FALSE if there (might) exist a dependence between a memory-reference
298 DRA and a memory-reference DRB. When versioning for alias may check a
299 dependence at run-time, return TRUE. Adjust *MAX_VF according to
300 the data dependence. */
302 static opt_result
303 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
304 loop_vec_info loop_vinfo,
305 unsigned int *max_vf)
307 unsigned int i;
308 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
309 struct data_reference *dra = DDR_A (ddr);
310 struct data_reference *drb = DDR_B (ddr);
311 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
312 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
313 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
314 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
315 lambda_vector dist_v;
316 unsigned int loop_depth;
318 /* In loop analysis all data references should be vectorizable. */
319 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
320 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
321 gcc_unreachable ();
323 /* Independent data accesses. */
324 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
325 return opt_result::success ();
327 if (dra == drb
328 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
329 return opt_result::success ();
331 /* We do not have to consider dependences between accesses that belong
332 to the same group, unless the stride could be smaller than the
333 group size. */
334 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
335 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
336 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
337 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
338 return opt_result::success ();
340 /* Even if we have an anti-dependence then, as the vectorized loop covers at
341 least two scalar iterations, there is always also a true dependence.
342 As the vectorizer does not re-order loads and stores we can ignore
343 the anti-dependence if TBAA can disambiguate both DRs similar to the
344 case with known negative distance anti-dependences (positive
345 distance anti-dependences would violate TBAA constraints). */
346 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
347 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
348 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
349 get_alias_set (DR_REF (drb))))
350 return opt_result::success ();
352 /* Unknown data dependence. */
353 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 355 /* If the user asserted safelen consecutive iterations can be
356 executed concurrently, assume independence. */
357 if (loop->safelen >= 2)
359 if ((unsigned int) loop->safelen < *max_vf)
360 *max_vf = loop->safelen;
361 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
362 return opt_result::success ();
365 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
366 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
367 return opt_result::failure_at
368 (stmtinfo_a->stmt,
369 "versioning for alias not supported for: "
370 "can't determine dependence between %T and %T\n",
371 DR_REF (dra), DR_REF (drb));
373 if (dump_enabled_p ())
374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
375 "versioning for alias required: "
376 "can't determine dependence between %T and %T\n",
377 DR_REF (dra), DR_REF (drb));
379 /* Add to list of ddrs that need to be tested at run-time. */
380 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
383 /* Known data dependence. */
384 if (DDR_NUM_DIST_VECTS (ddr) == 0)
 386 /* If the user asserted safelen consecutive iterations can be
387 executed concurrently, assume independence. */
388 if (loop->safelen >= 2)
390 if ((unsigned int) loop->safelen < *max_vf)
391 *max_vf = loop->safelen;
392 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
393 return opt_result::success ();
396 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
397 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
398 return opt_result::failure_at
399 (stmtinfo_a->stmt,
400 "versioning for alias not supported for: "
401 "bad dist vector for %T and %T\n",
402 DR_REF (dra), DR_REF (drb));
404 if (dump_enabled_p ())
405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
406 "versioning for alias required: "
407 "bad dist vector for %T and %T\n",
408 DR_REF (dra), DR_REF (drb));
409 /* Add to list of ddrs that need to be tested at run-time. */
410 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
413 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
415 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
416 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
417 loop_depth, max_vf))
418 return opt_result::success ();
420 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
422 int dist = dist_v[loop_depth];
424 if (dump_enabled_p ())
425 dump_printf_loc (MSG_NOTE, vect_location,
426 "dependence distance = %d.\n", dist);
428 if (dist == 0)
430 if (dump_enabled_p ())
431 dump_printf_loc (MSG_NOTE, vect_location,
432 "dependence distance == 0 between %T and %T\n",
433 DR_REF (dra), DR_REF (drb));
435 /* When we perform grouped accesses and perform implicit CSE
436 by detecting equal accesses and doing disambiguation with
437 runtime alias tests like for
438 .. = a[i];
439 .. = a[i+1];
440 a[i] = ..;
441 a[i+1] = ..;
442 *p = ..;
443 .. = a[i];
444 .. = a[i+1];
445 where we will end up loading { a[i], a[i+1] } once, make
446 sure that inserting group loads before the first load and
447 stores after the last store will do the right thing.
448 Similar for groups like
449 a[i] = ...;
450 ... = a[i];
451 a[i+1] = ...;
452 where loads from the group interleave with the store. */
453 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
454 return opt_result::failure_at (stmtinfo_a->stmt,
455 "READ_WRITE dependence"
456 " in interleaving.\n");
458 if (loop->safelen < 2)
460 tree indicator = dr_zero_step_indicator (dra);
461 if (!indicator || integer_zerop (indicator))
462 return opt_result::failure_at (stmtinfo_a->stmt,
463 "access also has a zero step\n");
464 else if (TREE_CODE (indicator) != INTEGER_CST)
465 vect_check_nonzero_value (loop_vinfo, indicator);
467 continue;
470 if (dist > 0 && DDR_REVERSED_P (ddr))
472 /* If DDR_REVERSED_P the order of the data-refs in DDR was
473 reversed (to make distance vector positive), and the actual
474 distance is negative. */
475 if (dump_enabled_p ())
476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
477 "dependence distance negative.\n");
478 /* Record a negative dependence distance to later limit the
479 amount of stmt copying / unrolling we can perform.
480 Only need to handle read-after-write dependence. */
481 if (DR_IS_READ (drb)
482 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
483 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
484 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
485 continue;
488 unsigned int abs_dist = abs (dist);
489 if (abs_dist >= 2 && abs_dist < *max_vf)
491 /* The dependence distance requires reduction of the maximal
492 vectorization factor. */
493 *max_vf = abs (dist);
494 if (dump_enabled_p ())
495 dump_printf_loc (MSG_NOTE, vect_location,
496 "adjusting maximal vectorization factor to %i\n",
497 *max_vf);
500 if (abs_dist >= *max_vf)
502 /* Dependence distance does not create dependence, as far as
503 vectorization is concerned, in this case. */
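          /* For example (illustrative numbers): with a dependence distance of
             8 and a VF of 8, scalar iteration i and scalar iteration i + 8
             fall into different vector iterations, so the vectorized loop
             still executes the dependent accesses in their original order.  */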
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location,
506 "dependence distance >= VF.\n");
507 continue;
510 return opt_result::failure_at (stmtinfo_a->stmt,
511 "not vectorized, possible dependence "
512 "between data-refs %T and %T\n",
513 DR_REF (dra), DR_REF (drb));
516 return opt_result::success ();
519 /* Function vect_analyze_data_ref_dependences.
521 Examine all the data references in the loop, and make sure there do not
522 exist any data dependences between them. Set *MAX_VF according to
523 the maximum vectorization factor the data dependences allow. */
525 opt_result
526 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
527 unsigned int *max_vf)
529 unsigned int i;
530 struct data_dependence_relation *ddr;
532 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
534 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
536 LOOP_VINFO_DDRS (loop_vinfo)
537 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
538 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
539 /* We need read-read dependences to compute
540 STMT_VINFO_SAME_ALIGN_REFS. */
541 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
542 &LOOP_VINFO_DDRS (loop_vinfo),
543 LOOP_VINFO_LOOP_NEST (loop_vinfo),
544 true);
545 gcc_assert (res);
548 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
550 /* For epilogues we either have no aliases or alias versioning
 551 was applied to the original loop. Therefore we may just get max_vf
 552 using the VF of the original loop. */
553 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
554 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
555 else
556 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
558 opt_result res
559 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
560 if (!res)
561 return res;
564 return opt_result::success ();
568 /* Function vect_slp_analyze_data_ref_dependence.
 570 Return TRUE if there (might) exist a dependence between a memory-reference
 571 DRA and a memory-reference DRB for VINFO. Return FALSE if the accesses
 572 are known to be independent or the dependence is harmless (read-read,
 573 or both references belong to the same interleaving chain). */
575 static bool
576 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
577 struct data_dependence_relation *ddr)
579 struct data_reference *dra = DDR_A (ddr);
580 struct data_reference *drb = DDR_B (ddr);
581 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
582 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
584 /* We need to check dependences of statements marked as unvectorizable
 585 as well; they can still prohibit vectorization. */
587 /* Independent data accesses. */
588 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
589 return false;
591 if (dra == drb)
592 return false;
594 /* Read-read is OK. */
595 if (DR_IS_READ (dra) && DR_IS_READ (drb))
596 return false;
598 /* If dra and drb are part of the same interleaving chain consider
599 them independent. */
600 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
601 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
602 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
603 return false;
605 /* Unknown data dependence. */
606 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
608 if (dump_enabled_p ())
609 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
610 "can't determine dependence between %T and %T\n",
611 DR_REF (dra), DR_REF (drb));
613 else if (dump_enabled_p ())
614 dump_printf_loc (MSG_NOTE, vect_location,
615 "determined dependence between %T and %T\n",
616 DR_REF (dra), DR_REF (drb));
618 return true;
622 /* Analyze dependences involved in the transform of SLP NODE. STORES
623 contain the vector of scalar stores of this instance if we are
624 disambiguating the loads. */
626 static bool
627 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
628 vec<stmt_vec_info> stores,
629 stmt_vec_info last_store_info)
631 /* This walks over all stmts involved in the SLP load/store done
632 in NODE verifying we can sink them up to the last stmt in the
633 group. */
634 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
635 vec_info *vinfo = last_access_info->vinfo;
636 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
638 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
639 if (access_info == last_access_info)
640 continue;
641 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
642 ao_ref ref;
643 bool ref_initialized_p = false;
644 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
645 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
647 gimple *stmt = gsi_stmt (gsi);
648 if (! gimple_vuse (stmt)
649 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
650 continue;
652 /* If we couldn't record a (single) data reference for this
653 stmt we have to resort to the alias oracle. */
654 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
655 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
656 if (!dr_b)
658 /* We are moving a store or sinking a load - this means
659 we cannot use TBAA for disambiguation. */
660 if (!ref_initialized_p)
661 ao_ref_init (&ref, DR_REF (dr_a));
662 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
663 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
664 return false;
665 continue;
668 bool dependent = false;
669 /* If we run into a store of this same instance (we've just
670 marked those) then delay dependence checking until we run
671 into the last store because this is where it will have
672 been sunk to (and we verify if we can do that as well). */
673 if (gimple_visited_p (stmt))
675 if (stmt_info != last_store_info)
676 continue;
677 unsigned i;
678 stmt_vec_info store_info;
679 FOR_EACH_VEC_ELT (stores, i, store_info)
681 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
682 ddr_p ddr = initialize_data_dependence_relation
683 (dr_a, store_dr, vNULL);
684 dependent
685 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
686 free_dependence_relation (ddr);
687 if (dependent)
688 break;
691 else
693 ddr_p ddr = initialize_data_dependence_relation (dr_a,
694 dr_b, vNULL);
695 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
696 free_dependence_relation (ddr);
698 if (dependent)
699 return false;
702 return true;
 706 /* Function vect_slp_analyze_instance_dependence.
 708 Examine all the data references in the SLP instance, and make sure there
 709 do not exist any data dependences between them that would prevent the
 710 instance from being vectorized. */
712 bool
713 vect_slp_analyze_instance_dependence (slp_instance instance)
715 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
717 /* The stores of this instance are at the root of the SLP tree. */
718 slp_tree store = SLP_INSTANCE_TREE (instance);
719 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
720 store = NULL;
722 /* Verify we can sink stores to the vectorized stmt insert location. */
723 stmt_vec_info last_store_info = NULL;
724 if (store)
726 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
727 return false;
729 /* Mark stores in this instance and remember the last one. */
730 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
731 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
732 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
735 bool res = true;
737 /* Verify we can sink loads to the vectorized stmt insert location,
738 special-casing stores of this instance. */
739 slp_tree load;
740 unsigned int i;
741 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
742 if (! vect_slp_analyze_node_dependences (instance, load,
743 store
744 ? SLP_TREE_SCALAR_STMTS (store)
745 : vNULL, last_store_info))
747 res = false;
748 break;
751 /* Unset the visited flag. */
752 if (store)
753 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
754 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
756 return res;
759 /* Record the base alignment guarantee given by DRB, which occurs
760 in STMT_INFO. */
762 static void
763 vect_record_base_alignment (stmt_vec_info stmt_info,
764 innermost_loop_behavior *drb)
766 vec_info *vinfo = stmt_info->vinfo;
767 bool existed;
768 innermost_loop_behavior *&entry
769 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
770 if (!existed || entry->base_alignment < drb->base_alignment)
772 entry = drb;
773 if (dump_enabled_p ())
774 dump_printf_loc (MSG_NOTE, vect_location,
775 "recording new base alignment for %T\n"
776 " alignment: %d\n"
777 " misalignment: %d\n"
778 " based on: %G",
779 drb->base_address,
780 drb->base_alignment,
781 drb->base_misalignment,
782 stmt_info->stmt);
786 /* If the region we're going to vectorize is reached, all unconditional
787 data references occur at least once. We can therefore pool the base
788 alignment guarantees from each unconditional reference. Do this by
789 going through all the data references in VINFO and checking whether
790 the containing statement makes the reference unconditionally. If so,
791 record the alignment of the base address in VINFO so that it can be
792 used for all other references with the same base. */
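/* For example (a hypothetical situation, not taken from the surrounding
   code): if the region contains an unconditional load from a[i] whose
   recorded base alignment is 32 bytes, that 32-byte guarantee can be reused
   when analysing a conditional store to a[i + 1] that shares the same base
   address, even though the conditional store itself contributes no
   guarantee.  */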
794 void
795 vect_record_base_alignments (vec_info *vinfo)
797 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
798 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
799 data_reference *dr;
800 unsigned int i;
801 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
803 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
804 stmt_vec_info stmt_info = dr_info->stmt;
805 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
806 && STMT_VINFO_VECTORIZABLE (stmt_info)
807 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
809 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
811 /* If DR is nested in the loop that is being vectorized, we can also
812 record the alignment of the base wrt the outer loop. */
813 if (loop && nested_in_vect_loop_p (loop, stmt_info))
814 vect_record_base_alignment
815 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
820 /* Return the target alignment for the vectorized form of DR_INFO. */
822 static unsigned int
823 vect_calculate_target_alignment (dr_vec_info *dr_info)
825 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
826 return targetm.vectorize.preferred_vector_alignment (vectype);
829 /* Function vect_compute_data_ref_alignment
831 Compute the misalignment of the data reference DR_INFO.
833 Output:
834 1. DR_MISALIGNMENT (DR_INFO) is defined.
836 FOR NOW: No analysis is actually performed. Misalignment is calculated
837 only for trivial cases. TODO. */
839 static void
840 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
842 stmt_vec_info stmt_info = dr_info->stmt;
843 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
844 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
845 struct loop *loop = NULL;
846 tree ref = DR_REF (dr_info->dr);
847 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
849 if (dump_enabled_p ())
850 dump_printf_loc (MSG_NOTE, vect_location,
851 "vect_compute_data_ref_alignment:\n");
853 if (loop_vinfo)
854 loop = LOOP_VINFO_LOOP (loop_vinfo);
856 /* Initialize misalignment to unknown. */
857 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
859 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
860 return;
862 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
863 bool step_preserves_misalignment_p;
865 unsigned HOST_WIDE_INT vector_alignment
866 = vect_calculate_target_alignment (dr_info) / BITS_PER_UNIT;
867 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
869 /* No step for BB vectorization. */
870 if (!loop)
872 gcc_assert (integer_zerop (drb->step));
873 step_preserves_misalignment_p = true;
876 /* In case the dataref is in an inner-loop of the loop that is being
877 vectorized (LOOP), we use the base and misalignment information
878 relative to the outer-loop (LOOP). This is ok only if the misalignment
879 stays the same throughout the execution of the inner-loop, which is why
880 we have to check that the stride of the dataref in the inner-loop evenly
881 divides by the vector alignment. */
882 else if (nested_in_vect_loop_p (loop, stmt_info))
884 step_preserves_misalignment_p
885 = (DR_STEP_ALIGNMENT (dr_info->dr) % vector_alignment) == 0;
887 if (dump_enabled_p ())
889 if (step_preserves_misalignment_p)
890 dump_printf_loc (MSG_NOTE, vect_location,
891 "inner step divides the vector alignment.\n");
892 else
893 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
894 "inner step doesn't divide the vector"
895 " alignment.\n");
899 /* Similarly we can only use base and misalignment information relative to
900 an innermost loop if the misalignment stays the same throughout the
901 execution of the loop. As above, this is the case if the stride of
902 the dataref evenly divides by the alignment. */
903 else
905 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
906 step_preserves_misalignment_p
907 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vector_alignment);
909 if (!step_preserves_misalignment_p && dump_enabled_p ())
910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
911 "step doesn't divide the vector alignment.\n");
914 unsigned int base_alignment = drb->base_alignment;
915 unsigned int base_misalignment = drb->base_misalignment;
917 /* Calculate the maximum of the pooled base address alignment and the
918 alignment that we can compute for DR itself. */
919 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
920 if (entry && base_alignment < (*entry)->base_alignment)
922 base_alignment = (*entry)->base_alignment;
923 base_misalignment = (*entry)->base_misalignment;
926 if (drb->offset_alignment < vector_alignment
927 || !step_preserves_misalignment_p
928 /* We need to know whether the step wrt the vectorized loop is
929 negative when computing the starting misalignment below. */
930 || TREE_CODE (drb->step) != INTEGER_CST)
932 if (dump_enabled_p ())
933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
934 "Unknown alignment for access: %T\n", ref);
935 return;
938 if (base_alignment < vector_alignment)
940 unsigned int max_alignment;
941 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
942 if (max_alignment < vector_alignment
943 || !vect_can_force_dr_alignment_p (base,
944 vector_alignment * BITS_PER_UNIT))
946 if (dump_enabled_p ())
947 dump_printf_loc (MSG_NOTE, vect_location,
948 "can't force alignment of ref: %T\n", ref);
949 return;
952 /* Force the alignment of the decl.
953 NOTE: This is the only change to the code we make during
954 the analysis phase, before deciding to vectorize the loop. */
955 if (dump_enabled_p ())
956 dump_printf_loc (MSG_NOTE, vect_location,
957 "force alignment of %T\n", ref);
959 dr_info->base_decl = base;
960 dr_info->base_misaligned = true;
961 base_misalignment = 0;
963 poly_int64 misalignment
964 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
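  /* A worked example with illustrative numbers: with a 16-byte vector
     alignment, a base misalignment of 4 and a constant DR_INIT of 24, the
     value computed above is 4 + 24 = 28, which known_misalignment below
     reduces to 28 % 16 = 12 bytes (assuming a forward, positive-step access
     so that the backward-step adjustment just below does not apply).  */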
 966 /* If this is a backward running DR then the first access in the larger
 967 vectype actually is N-1 elements before the address in the DR.
968 Adjust misalign accordingly. */
969 if (tree_int_cst_sgn (drb->step) < 0)
970 /* PLUS because STEP is negative. */
971 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
972 * TREE_INT_CST_LOW (drb->step));
974 unsigned int const_misalignment;
975 if (!known_misalignment (misalignment, vector_alignment,
976 &const_misalignment))
978 if (dump_enabled_p ())
979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
980 "Non-constant misalignment for access: %T\n", ref);
981 return;
984 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
986 if (dump_enabled_p ())
987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
988 "misalign = %d bytes of ref %T\n",
989 DR_MISALIGNMENT (dr_info), ref);
991 return;
994 /* Function vect_update_misalignment_for_peel.
995 Sets DR_INFO's misalignment
996 - to 0 if it has the same alignment as DR_PEEL_INFO,
 997 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
998 - to -1 (unknown) otherwise.
1000 DR_INFO - the data reference whose misalignment is to be adjusted.
1001 DR_PEEL_INFO - the data reference whose misalignment is being made
1002 zero in the vector loop by the peel.
1003 NPEEL - the number of iterations in the peel loop if the misalignment
1004 of DR_PEEL_INFO is known at compile time. */
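/* A worked example with made-up numbers for the "both misalignments known"
   path below: with a target alignment of 16 bytes, an element size of 4 and
   NPEEL = 3, a data reference that starts out misaligned by 8 bytes ends up
   (8 + 3 * 4) & 15 = 4 bytes misaligned after the peel, while one that
   starts out misaligned by 4 bytes ends up (4 + 3 * 4) & 15 = 0, i.e.
   aligned.  */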
1006 static void
1007 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1008 dr_vec_info *dr_peel_info, int npeel)
1010 unsigned int i;
1011 vec<dr_p> same_aligned_drs;
1012 struct data_reference *current_dr;
1013 int dr_size = vect_get_scalar_dr_size (dr_info);
1014 int dr_peel_size = vect_get_scalar_dr_size (dr_peel_info);
1015 stmt_vec_info stmt_info = dr_info->stmt;
1016 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1018 /* For interleaved data accesses the step in the loop must be multiplied by
1019 the size of the interleaving group. */
1020 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1021 dr_size *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
1022 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1023 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1025 /* It can be assumed that the data refs with the same alignment as dr_peel
1026 are aligned in the vector loop. */
1027 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1028 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1030 if (current_dr != dr_info->dr)
1031 continue;
1032 gcc_assert (!known_alignment_for_access_p (dr_info)
1033 || !known_alignment_for_access_p (dr_peel_info)
1034 || (DR_MISALIGNMENT (dr_info) / dr_size
1035 == DR_MISALIGNMENT (dr_peel_info) / dr_peel_size));
1036 SET_DR_MISALIGNMENT (dr_info, 0);
1037 return;
1040 if (known_alignment_for_access_p (dr_info)
1041 && known_alignment_for_access_p (dr_peel_info))
1043 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1044 size_zero_node) < 0;
1045 int misal = DR_MISALIGNMENT (dr_info);
1046 misal += negative ? -npeel * dr_size : npeel * dr_size;
1047 misal &= DR_TARGET_ALIGNMENT (dr_info) - 1;
1048 SET_DR_MISALIGNMENT (dr_info, misal);
1049 return;
1052 if (dump_enabled_p ())
1053 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1054 "to unknown (-1).\n");
1055 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1059 /* Function verify_data_ref_alignment
1061 Return TRUE if DR_INFO can be handled with respect to alignment. */
1063 static opt_result
1064 verify_data_ref_alignment (dr_vec_info *dr_info)
1066 enum dr_alignment_support supportable_dr_alignment
1067 = vect_supportable_dr_alignment (dr_info, false);
1068 if (!supportable_dr_alignment)
1069 return opt_result::failure_at
1070 (dr_info->stmt->stmt,
1071 DR_IS_READ (dr_info->dr)
1072 ? "not vectorized: unsupported unaligned load: %T\n"
1073 : "not vectorized: unsupported unaligned store: %T\n",
1074 DR_REF (dr_info->dr));
1076 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1077 dump_printf_loc (MSG_NOTE, vect_location,
1078 "Vectorizing an unaligned access.\n");
1080 return opt_result::success ();
1083 /* Function vect_verify_datarefs_alignment
1085 Return TRUE if all data references in the loop can be
1086 handled with respect to alignment. */
1088 opt_result
1089 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1091 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1092 struct data_reference *dr;
1093 unsigned int i;
1095 FOR_EACH_VEC_ELT (datarefs, i, dr)
1097 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1098 stmt_vec_info stmt_info = dr_info->stmt;
1100 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1101 continue;
1103 /* For interleaving, only the alignment of the first access matters. */
1104 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1105 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1106 continue;
1108 /* Strided accesses perform only component accesses, alignment is
1109 irrelevant for them. */
1110 if (STMT_VINFO_STRIDED_P (stmt_info)
1111 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1112 continue;
1114 opt_result res = verify_data_ref_alignment (dr_info);
1115 if (!res)
1116 return res;
1119 return opt_result::success ();
 1122 /* Given a memory reference EXP, return whether its alignment is less
1123 than its size. */
1125 static bool
1126 not_size_aligned (tree exp)
1128 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1129 return true;
1131 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1132 > get_object_alignment (exp));
1135 /* Function vector_alignment_reachable_p
1137 Return true if vector alignment for DR_INFO is reachable by peeling
1138 a few loop iterations. Return false otherwise. */
1140 static bool
1141 vector_alignment_reachable_p (dr_vec_info *dr_info)
1143 stmt_vec_info stmt_info = dr_info->stmt;
1144 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1146 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 1148 /* For interleaved accesses we peel only if the number of iterations in
 1149 the prolog loop ({VF - misalignment}) is a multiple of the
 1150 number of interleaved accesses.
1151 int elem_size, mis_in_elements;
1153 /* FORNOW: handle only known alignment. */
1154 if (!known_alignment_for_access_p (dr_info))
1155 return false;
1157 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1158 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1159 elem_size = vector_element_size (vector_size, nelements);
1160 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1162 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1163 return false;
 1166 /* If the misalignment is known at compile time then allow peeling
1167 only if natural alignment is reachable through peeling. */
1168 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1170 HOST_WIDE_INT elmsize =
1171 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1172 if (dump_enabled_p ())
1174 dump_printf_loc (MSG_NOTE, vect_location,
1175 "data size = %wd. misalignment = %d.\n", elmsize,
1176 DR_MISALIGNMENT (dr_info));
1178 if (DR_MISALIGNMENT (dr_info) % elmsize)
1180 if (dump_enabled_p ())
1181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1182 "data size does not divide the misalignment.\n");
1183 return false;
1187 if (!known_alignment_for_access_p (dr_info))
1189 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1190 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1191 if (dump_enabled_p ())
1192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1193 "Unknown misalignment, %snaturally aligned\n",
1194 is_packed ? "not " : "");
1195 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1198 return true;
1202 /* Calculate the cost of the memory access represented by DR_INFO. */
1204 static void
1205 vect_get_data_access_cost (dr_vec_info *dr_info,
1206 unsigned int *inside_cost,
1207 unsigned int *outside_cost,
1208 stmt_vector_for_cost *body_cost_vec,
1209 stmt_vector_for_cost *prologue_cost_vec)
1211 stmt_vec_info stmt_info = dr_info->stmt;
1212 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1213 int ncopies;
1215 if (PURE_SLP_STMT (stmt_info))
1216 ncopies = 1;
1217 else
1218 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1220 if (DR_IS_READ (dr_info->dr))
1221 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1222 prologue_cost_vec, body_cost_vec, false);
1223 else
1224 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1226 if (dump_enabled_p ())
1227 dump_printf_loc (MSG_NOTE, vect_location,
1228 "vect_get_data_access_cost: inside_cost = %d, "
1229 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1233 typedef struct _vect_peel_info
1235 dr_vec_info *dr_info;
1236 int npeel;
1237 unsigned int count;
1238 } *vect_peel_info;
1240 typedef struct _vect_peel_extended_info
1242 struct _vect_peel_info peel_info;
1243 unsigned int inside_cost;
1244 unsigned int outside_cost;
1245 } *vect_peel_extended_info;
1248 /* Peeling hashtable helpers. */
1250 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1252 static inline hashval_t hash (const _vect_peel_info *);
1253 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1256 inline hashval_t
1257 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1259 return (hashval_t) peel_info->npeel;
1262 inline bool
1263 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1265 return (a->npeel == b->npeel);
1269 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1271 static void
1272 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1273 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1274 int npeel)
1276 struct _vect_peel_info elem, *slot;
1277 _vect_peel_info **new_slot;
1278 bool supportable_dr_alignment
1279 = vect_supportable_dr_alignment (dr_info, true);
1281 elem.npeel = npeel;
1282 slot = peeling_htab->find (&elem);
1283 if (slot)
1284 slot->count++;
1285 else
1287 slot = XNEW (struct _vect_peel_info);
1288 slot->npeel = npeel;
1289 slot->dr_info = dr_info;
1290 slot->count = 1;
1291 new_slot = peeling_htab->find_slot (slot, INSERT);
1292 *new_slot = slot;
1295 if (!supportable_dr_alignment
1296 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1297 slot->count += VECT_MAX_COST;
1301 /* Traverse peeling hash table to find peeling option that aligns maximum
1302 number of data accesses. */
1305 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1306 _vect_peel_extended_info *max)
1308 vect_peel_info elem = *slot;
1310 if (elem->count > max->peel_info.count
1311 || (elem->count == max->peel_info.count
1312 && max->peel_info.npeel > elem->npeel))
1314 max->peel_info.npeel = elem->npeel;
1315 max->peel_info.count = elem->count;
1316 max->peel_info.dr_info = elem->dr_info;
1319 return 1;
1322 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1323 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1324 we assume DR0_INFO's misalignment will be zero after peeling. */
1326 static void
1327 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1328 dr_vec_info *dr0_info,
1329 unsigned int *inside_cost,
1330 unsigned int *outside_cost,
1331 stmt_vector_for_cost *body_cost_vec,
1332 stmt_vector_for_cost *prologue_cost_vec,
1333 unsigned int npeel,
1334 bool unknown_misalignment)
1336 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1337 unsigned i;
1338 data_reference *dr;
1340 FOR_EACH_VEC_ELT (datarefs, i, dr)
1342 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1343 stmt_vec_info stmt_info = dr_info->stmt;
1344 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1345 continue;
1347 /* For interleaving, only the alignment of the first access
1348 matters. */
1349 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1350 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1351 continue;
1353 /* Strided accesses perform only component accesses, alignment is
1354 irrelevant for them. */
1355 if (STMT_VINFO_STRIDED_P (stmt_info)
1356 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1357 continue;
1359 int save_misalignment;
1360 save_misalignment = DR_MISALIGNMENT (dr_info);
1361 if (npeel == 0)
1363 else if (unknown_misalignment && dr_info == dr0_info)
1364 SET_DR_MISALIGNMENT (dr_info, 0);
1365 else
1366 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1367 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1368 body_cost_vec, prologue_cost_vec);
1369 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1373 /* Traverse peeling hash table and calculate cost for each peeling option.
1374 Find the one with the lowest cost. */
1377 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1378 _vect_peel_extended_info *min)
1380 vect_peel_info elem = *slot;
1381 int dummy;
1382 unsigned int inside_cost = 0, outside_cost = 0;
1383 stmt_vec_info stmt_info = elem->dr_info->stmt;
1384 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1385 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1386 epilogue_cost_vec;
1388 prologue_cost_vec.create (2);
1389 body_cost_vec.create (2);
1390 epilogue_cost_vec.create (2);
1392 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1393 &outside_cost, &body_cost_vec,
1394 &prologue_cost_vec, elem->npeel, false);
1396 body_cost_vec.release ();
1398 outside_cost += vect_get_known_peeling_cost
1399 (loop_vinfo, elem->npeel, &dummy,
1400 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1401 &prologue_cost_vec, &epilogue_cost_vec);
1403 /* Prologue and epilogue costs are added to the target model later.
1404 These costs depend only on the scalar iteration cost, the
1405 number of peeling iterations finally chosen, and the number of
1406 misaligned statements. So discard the information found here. */
1407 prologue_cost_vec.release ();
1408 epilogue_cost_vec.release ();
1410 if (inside_cost < min->inside_cost
1411 || (inside_cost == min->inside_cost
1412 && outside_cost < min->outside_cost))
1414 min->inside_cost = inside_cost;
1415 min->outside_cost = outside_cost;
1416 min->peel_info.dr_info = elem->dr_info;
1417 min->peel_info.npeel = elem->npeel;
1418 min->peel_info.count = elem->count;
1421 return 1;
1425 /* Choose best peeling option by traversing peeling hash table and either
1426 choosing an option with the lowest cost (if cost model is enabled) or the
1427 option that aligns as many accesses as possible. */
1429 static struct _vect_peel_extended_info
1430 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1431 loop_vec_info loop_vinfo)
1433 struct _vect_peel_extended_info res;
1435 res.peel_info.dr_info = NULL;
1437 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1439 res.inside_cost = INT_MAX;
1440 res.outside_cost = INT_MAX;
1441 peeling_htab->traverse <_vect_peel_extended_info *,
1442 vect_peeling_hash_get_lowest_cost> (&res);
1444 else
1446 res.peel_info.count = 0;
1447 peeling_htab->traverse <_vect_peel_extended_info *,
1448 vect_peeling_hash_get_most_frequent> (&res);
1449 res.inside_cost = 0;
1450 res.outside_cost = 0;
1453 return res;
1456 /* Return true if the new peeling NPEEL is supported. */
1458 static bool
1459 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1460 unsigned npeel)
1462 unsigned i;
1463 struct data_reference *dr = NULL;
1464 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1465 enum dr_alignment_support supportable_dr_alignment;
1467 /* Ensure that all data refs can be vectorized after the peel. */
1468 FOR_EACH_VEC_ELT (datarefs, i, dr)
1470 int save_misalignment;
1472 if (dr == dr0_info->dr)
1473 continue;
1475 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1476 stmt_vec_info stmt_info = dr_info->stmt;
1477 /* For interleaving, only the alignment of the first access
1478 matters. */
1479 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1480 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1481 continue;
1483 /* Strided accesses perform only component accesses, alignment is
1484 irrelevant for them. */
1485 if (STMT_VINFO_STRIDED_P (stmt_info)
1486 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1487 continue;
1489 save_misalignment = DR_MISALIGNMENT (dr_info);
1490 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1491 supportable_dr_alignment
1492 = vect_supportable_dr_alignment (dr_info, false);
1493 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1495 if (!supportable_dr_alignment)
1496 return false;
1499 return true;
1502 /* Function vect_enhance_data_refs_alignment
1504 This pass will use loop versioning and loop peeling in order to enhance
1505 the alignment of data references in the loop.
1507 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1508 original loop is to be vectorized. Any other loops that are created by
 1509 the transformations performed in this pass are not supposed to be
1510 vectorized. This restriction will be relaxed.
1512 This pass will require a cost model to guide it whether to apply peeling
1513 or versioning or a combination of the two. For example, the scheme that
 1514 Intel uses when given a loop with several memory accesses, is as follows:
 1515 choose one memory access ('p') whose alignment you want to force by doing
1516 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1517 other accesses are not necessarily aligned, or (2) use loop versioning to
1518 generate one loop in which all accesses are aligned, and another loop in
1519 which only 'p' is necessarily aligned.
1521 ("Automatic Intra-Register Vectorization for the Intel Architecture",
 1522 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1523 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1525 Devising a cost model is the most critical aspect of this work. It will
1526 guide us on which access to peel for, whether to use loop versioning, how
1527 many versions to create, etc. The cost model will probably consist of
1528 generic considerations as well as target specific considerations (on
1529 powerpc for example, misaligned stores are more painful than misaligned
1530 loads).
1532 Here are the general steps involved in alignment enhancements:
1534 -- original loop, before alignment analysis:
1535 for (i=0; i<N; i++){
1536 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1537 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1540 -- After vect_compute_data_refs_alignment:
1541 for (i=0; i<N; i++){
1542 x = q[i]; # DR_MISALIGNMENT(q) = 3
1543 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1546 -- Possibility 1: we do loop versioning:
1547 if (p is aligned) {
1548 for (i=0; i<N; i++){ # loop 1A
1549 x = q[i]; # DR_MISALIGNMENT(q) = 3
1550 p[i] = y; # DR_MISALIGNMENT(p) = 0
1553 else {
1554 for (i=0; i<N; i++){ # loop 1B
1555 x = q[i]; # DR_MISALIGNMENT(q) = 3
1556 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1560 -- Possibility 2: we do loop peeling:
1561 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1562 x = q[i];
1563 p[i] = y;
1565 for (i = 3; i < N; i++){ # loop 2A
1566 x = q[i]; # DR_MISALIGNMENT(q) = 0
1567 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1570 -- Possibility 3: combination of loop peeling and versioning:
1571 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1572 x = q[i];
1573 p[i] = y;
1575 if (p is aligned) {
1576 for (i = 3; i<N; i++){ # loop 3A
1577 x = q[i]; # DR_MISALIGNMENT(q) = 0
1578 p[i] = y; # DR_MISALIGNMENT(p) = 0
1581 else {
1582 for (i = 3; i<N; i++){ # loop 3B
1583 x = q[i]; # DR_MISALIGNMENT(q) = 0
1584 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1588 These loops are later passed to loop_transform to be vectorized. The
1589 vectorizer will use the alignment information to guide the transformation
1590 (whether to generate regular loads/stores, or with special handling for
1591 misalignment). */
1593 opt_result
1594 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1596 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1597 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1598 enum dr_alignment_support supportable_dr_alignment;
1599 dr_vec_info *first_store = NULL;
1600 dr_vec_info *dr0_info = NULL;
1601 struct data_reference *dr;
1602 unsigned int i, j;
1603 bool do_peeling = false;
1604 bool do_versioning = false;
1605 unsigned int npeel = 0;
1606 bool one_misalignment_known = false;
1607 bool one_misalignment_unknown = false;
1608 bool one_dr_unsupportable = false;
1609 dr_vec_info *unsupportable_dr_info = NULL;
1610 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1611 unsigned possible_npeel_number = 1;
1612 tree vectype;
1613 unsigned int mis, same_align_drs_max = 0;
1614 hash_table<peel_info_hasher> peeling_htab (1);
1616 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1618 /* Reset data so we can safely be called multiple times. */
1619 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1620 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1622 /* While cost model enhancements are expected in the future, the high level
1623 view of the code at this time is as follows:
1625 A) If there is a misaligned access then see if peeling to align
1626 this access can make all data references satisfy
1627 vect_supportable_dr_alignment. If so, update data structures
1628 as needed and return true.
1630 B) If peeling wasn't possible and there is a data reference with an
1631 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1632 then see if loop versioning checks can be used to make all data
1633 references satisfy vect_supportable_dr_alignment. If so, update
1634 data structures as needed and return true.
1636 C) If neither peeling nor versioning were successful then return false if
1637 any data reference does not satisfy vect_supportable_dr_alignment.
1639 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1641 Note, Possibility 3 above (which is peeling and versioning together) is not
1642 being done at this time. */
1644 /* (1) Peeling to force alignment. */
1646 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1647 Considerations:
1648 + How many accesses will become aligned due to the peeling
1649 - How many accesses will become unaligned due to the peeling,
1650 and the cost of misaligned accesses.
1651 - The cost of peeling (the extra runtime checks, the increase
1652 in code size). */
1654 FOR_EACH_VEC_ELT (datarefs, i, dr)
1656 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1657 stmt_vec_info stmt_info = dr_info->stmt;
1659 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1660 continue;
1662 /* For interleaving, only the alignment of the first access
1663 matters. */
1664 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1665 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1666 continue;
1668 /* For scatter-gather or invariant accesses there is nothing
1669 to enhance. */
1670 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1671 || integer_zerop (DR_STEP (dr)))
1672 continue;
1674 /* Strided accesses perform only component accesses, alignment is
1675 irrelevant for them. */
1676 if (STMT_VINFO_STRIDED_P (stmt_info)
1677 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1678 continue;
1680 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1681 do_peeling = vector_alignment_reachable_p (dr_info);
1682 if (do_peeling)
1684 if (known_alignment_for_access_p (dr_info))
1686 unsigned int npeel_tmp = 0;
1687 bool negative = tree_int_cst_compare (DR_STEP (dr),
1688 size_zero_node) < 0;
1690 vectype = STMT_VINFO_VECTYPE (stmt_info);
1691 unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);
1692 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1693 mis = (negative
1694 ? DR_MISALIGNMENT (dr_info)
1695 : -DR_MISALIGNMENT (dr_info));
1696 if (DR_MISALIGNMENT (dr_info) != 0)
1697 npeel_tmp = (mis & (target_align - 1)) / dr_size;
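              /* A worked example with illustrative numbers: for a forward
                 access with DR_MISALIGNMENT of 4 bytes, TARGET_ALIGN of 16
                 and DR_SIZE of 4, MIS is -4, so NPEEL_TMP becomes
                 (-4 & 15) / 4 = 12 / 4 = 3: peeling three scalar iterations
                 moves the access to a 16-byte boundary.  */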
1699 /* For multiple types, it is possible that the bigger type access
1700 will have more than one peeling option. E.g., a loop with two
1701 types: one of size (vector size / 4), and the other one of
 1702 size (vector size / 8). The vectorization factor will be 8. If both
 1703 accesses are misaligned by 3, the first one needs one scalar
 1704 iteration to be aligned, and the second one needs 5. But the
 1705 first one will also be aligned by peeling 5 scalar
1706 iterations, and in that case both accesses will be aligned.
1707 Hence, except for the immediate peeling amount, we also want
1708 to try to add full vector size, while we don't exceed
1709 vectorization factor.
1710 We do this automatically for cost model, since we calculate
1711 cost for every peeling option. */
1712 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1714 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1715 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1716 possible_npeel_number
1717 = vect_get_num_vectors (nscalars, vectype);
1719 /* NPEEL_TMP is 0 when there is no misalignment, but also
1720 allow peeling NELEMENTS. */
1721 if (DR_MISALIGNMENT (dr_info) == 0)
1722 possible_npeel_number++;
1725 /* Save info about DR in the hash table. Also include peeling
1726 amounts according to the explanation above. */
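/* For instance, assuming 4-byte elements, a 16-byte target alignment
and an initial NPEEL_TMP of 1, the loop below records the candidate
peeling amounts 1, 5, 9, ... until POSSIBLE_NPEEL_NUMBER entries
have been added. */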
1727 for (j = 0; j < possible_npeel_number; j++)
1729 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1730 dr_info, npeel_tmp);
1731 npeel_tmp += target_align / dr_size;
1734 one_misalignment_known = true;
1736 else
1738 /* If we don't know any misalignment values, we prefer
1739 peeling for the data-ref that has the maximum number of
1740 data-refs with the same alignment, unless the target prefers
1741 to align stores over loads. */
1742 unsigned same_align_drs
1743 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1744 if (!dr0_info
1745 || same_align_drs_max < same_align_drs)
1747 same_align_drs_max = same_align_drs;
1748 dr0_info = dr_info;
1750 /* For data-refs with the same number of related
1751 accesses prefer the one where the misalign
1752 computation will be invariant in the outermost loop. */
1753 else if (same_align_drs_max == same_align_drs)
1755 struct loop *ivloop0, *ivloop;
1756 ivloop0 = outermost_invariant_loop_for_expr
1757 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1758 ivloop = outermost_invariant_loop_for_expr
1759 (loop, DR_BASE_ADDRESS (dr));
1760 if ((ivloop && !ivloop0)
1761 || (ivloop && ivloop0
1762 && flow_loop_nested_p (ivloop, ivloop0)))
1763 dr0_info = dr_info;
1766 one_misalignment_unknown = true;
1768 /* Check for data refs with unsupportable alignment that
1769 can be peeled. */
1770 if (!supportable_dr_alignment)
1772 one_dr_unsupportable = true;
1773 unsupportable_dr_info = dr_info;
1776 if (!first_store && DR_IS_WRITE (dr))
1777 first_store = dr_info;
1780 else
1782 if (!aligned_access_p (dr_info))
1784 if (dump_enabled_p ())
1785 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1786 "vector alignment may not be reachable\n");
1787 break;
1792 /* Check if we can possibly peel the loop. */
1793 if (!vect_can_advance_ivs_p (loop_vinfo)
1794 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1795 || loop->inner)
1796 do_peeling = false;
1798 struct _vect_peel_extended_info peel_for_known_alignment;
1799 struct _vect_peel_extended_info peel_for_unknown_alignment;
1800 struct _vect_peel_extended_info best_peel;
1802 peel_for_unknown_alignment.inside_cost = INT_MAX;
1803 peel_for_unknown_alignment.outside_cost = INT_MAX;
1804 peel_for_unknown_alignment.peel_info.count = 0;
1806 if (do_peeling
1807 && one_misalignment_unknown)
1809 /* Check whether the target prefers stores over loads, i.e., whether
1810 misaligned stores are more expensive than misaligned loads (taking
1811 drs with the same alignment into account). */
1812 unsigned int load_inside_cost = 0;
1813 unsigned int load_outside_cost = 0;
1814 unsigned int store_inside_cost = 0;
1815 unsigned int store_outside_cost = 0;
1816 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1818 stmt_vector_for_cost dummy;
1819 dummy.create (2);
1820 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1821 &load_inside_cost,
1822 &load_outside_cost,
1823 &dummy, &dummy, estimated_npeels, true);
1824 dummy.release ();
1826 if (first_store)
1828 dummy.create (2);
1829 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1830 &store_inside_cost,
1831 &store_outside_cost,
1832 &dummy, &dummy,
1833 estimated_npeels, true);
1834 dummy.release ();
1836 else
1838 store_inside_cost = INT_MAX;
1839 store_outside_cost = INT_MAX;
1842 if (load_inside_cost > store_inside_cost
1843 || (load_inside_cost == store_inside_cost
1844 && load_outside_cost > store_outside_cost))
1846 dr0_info = first_store;
1847 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1848 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1850 else
1852 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1853 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1856 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1857 prologue_cost_vec.create (2);
1858 epilogue_cost_vec.create (2);
1860 int dummy2;
1861 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1862 (loop_vinfo, estimated_npeels, &dummy2,
1863 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1864 &prologue_cost_vec, &epilogue_cost_vec);
1866 prologue_cost_vec.release ();
1867 epilogue_cost_vec.release ();
1869 peel_for_unknown_alignment.peel_info.count = 1
1870 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1873 peel_for_unknown_alignment.peel_info.npeel = 0;
1874 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1876 best_peel = peel_for_unknown_alignment;
1878 peel_for_known_alignment.inside_cost = INT_MAX;
1879 peel_for_known_alignment.outside_cost = INT_MAX;
1880 peel_for_known_alignment.peel_info.count = 0;
1881 peel_for_known_alignment.peel_info.dr_info = NULL;
1883 if (do_peeling && one_misalignment_known)
1885 /* Peeling is possible, but there is no data access that is not supported
1886 unless aligned. So we try to choose the best possible peeling from
1887 the hash table. */
1888 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1889 (&peeling_htab, loop_vinfo);
1892 /* Compare costs of peeling for known and unknown alignment. */
1893 if (peel_for_known_alignment.peel_info.dr_info != NULL
1894 && peel_for_unknown_alignment.inside_cost
1895 >= peel_for_known_alignment.inside_cost)
1897 best_peel = peel_for_known_alignment;
1899 /* If the best peeling for known alignment has NPEEL == 0, perform no
1900 peeling at all except if there is an unsupportable dr that we can
1901 align. */
1902 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1903 do_peeling = false;
1906 /* If there is an unsupportable data ref, prefer this over all choices so far
1907 since we'd have to discard a chosen peeling except when it accidentally
1908 aligned the unsupportable data ref. */
1909 if (one_dr_unsupportable)
1910 dr0_info = unsupportable_dr_info;
1911 else if (do_peeling)
1913 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1914 TODO: Use nopeel_outside_cost or get rid of it? */
1915 unsigned nopeel_inside_cost = 0;
1916 unsigned nopeel_outside_cost = 0;
1918 stmt_vector_for_cost dummy;
1919 dummy.create (2);
1920 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
1921 &nopeel_outside_cost, &dummy, &dummy,
1922 0, false);
1923 dummy.release ();
1925 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1926 costs will be recorded. */
1927 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1928 prologue_cost_vec.create (2);
1929 epilogue_cost_vec.create (2);
1931 int dummy2;
1932 nopeel_outside_cost += vect_get_known_peeling_cost
1933 (loop_vinfo, 0, &dummy2,
1934 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1935 &prologue_cost_vec, &epilogue_cost_vec);
1937 prologue_cost_vec.release ();
1938 epilogue_cost_vec.release ();
1940 npeel = best_peel.peel_info.npeel;
1941 dr0_info = best_peel.peel_info.dr_info;
1943 /* If performing no peeling is not more expensive than the best
1944 peeling found so far, don't perform any peeling. */
1945 if (nopeel_inside_cost <= best_peel.inside_cost)
1946 do_peeling = false;
1949 if (do_peeling)
1951 stmt_vec_info stmt_info = dr0_info->stmt;
1952 vectype = STMT_VINFO_VECTYPE (stmt_info);
1954 if (known_alignment_for_access_p (dr0_info))
1956 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
1957 size_zero_node) < 0;
1958 if (!npeel)
1960 /* Since it's known at compile time, compute the number of
1961 iterations in the peeled loop (the peeling factor) for use in
1962 updating DR_MISALIGNMENT values. The peeling factor is the
1963 vectorization factor minus the misalignment as an element
1964 count. */
1965 mis = (negative
1966 ? DR_MISALIGNMENT (dr0_info)
1967 : -DR_MISALIGNMENT (dr0_info));
1968 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
1969 npeel = ((mis & (target_align - 1))
1970 / vect_get_scalar_dr_size (dr0_info));
1973 /* For interleaved data access every iteration accesses all the
1974 members of the group, therefore we divide the number of iterations
1975 by the group size. */
1976 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1977 npeel /= DR_GROUP_SIZE (stmt_info);
1979 if (dump_enabled_p ())
1980 dump_printf_loc (MSG_NOTE, vect_location,
1981 "Try peeling by %d\n", npeel);
1984 /* Ensure that all datarefs can be vectorized after the peel. */
1985 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
1986 do_peeling = false;
1988 /* Check if all datarefs are supportable and log. */
1989 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
1991 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
1992 if (!stat)
1993 do_peeling = false;
1994 else
1995 return stat;
1998 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1999 if (do_peeling)
2001 unsigned max_allowed_peel
2002 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2003 if (max_allowed_peel != (unsigned)-1)
2005 unsigned max_peel = npeel;
2006 if (max_peel == 0)
2008 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
2009 max_peel = (target_align
2010 / vect_get_scalar_dr_size (dr0_info) - 1);
2012 if (max_peel > max_allowed_peel)
2014 do_peeling = false;
2015 if (dump_enabled_p ())
2016 dump_printf_loc (MSG_NOTE, vect_location,
2017 "Disable peeling, max peels reached: %d\n", max_peel);
2022 /* Cost model #2 - if peeling may result in a remaining loop not
2023 iterating enough to be vectorized then do not peel. Since this
2024 is a cost heuristic rather than a correctness decision, use the
2025 most likely runtime value for variable vectorization factors. */
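/* For example, with an assumed VF of 4 and NPEEL of 0, MAX_PEEL is
4 - 1 = 3, so a known iteration count below 4 + 3 = 7 disables
peeling. */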
2026 if (do_peeling
2027 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2029 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2030 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2031 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2032 < assumed_vf + max_peel)
2033 do_peeling = false;
2036 if (do_peeling)
2038 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2039 If the misalignment of DR_i is identical to that of dr0 then set
2040 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2041 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2042 by the peeling factor times the element size of DR_i (MOD the
2043 vectorization factor times the size). Otherwise, the
2044 misalignment of DR_i must be set to unknown. */
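/* For instance, with NPEEL = 1 and 4-byte elements, a data-ref whose
misalignment was known to be 8 ends up with misalignment 8 + 1 * 4
(modulo the vectorization factor times the size), while a data-ref
with unknown misalignment stays unknown. */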
2045 FOR_EACH_VEC_ELT (datarefs, i, dr)
2046 if (dr != dr0_info->dr)
2048 /* Strided accesses perform only component accesses, alignment
2049 is irrelevant for them. */
2050 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2051 stmt_info = dr_info->stmt;
2052 if (STMT_VINFO_STRIDED_P (stmt_info)
2053 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2054 continue;
2056 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2059 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2060 if (npeel)
2061 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2062 else
2063 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2064 = DR_MISALIGNMENT (dr0_info);
2065 SET_DR_MISALIGNMENT (dr0_info, 0);
2066 if (dump_enabled_p ())
2068 dump_printf_loc (MSG_NOTE, vect_location,
2069 "Alignment of access forced using peeling.\n");
2070 dump_printf_loc (MSG_NOTE, vect_location,
2071 "Peeling for alignment will be applied.\n");
2074 /* The inside-loop cost will be accounted for in vectorizable_load
2075 and vectorizable_store correctly with adjusted alignments.
2076 Drop the body_cst_vec on the floor here. */
2077 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2078 gcc_assert (stat);
2079 return stat;
2083 /* (2) Versioning to force alignment. */
2085 /* Try versioning if:
2086 1) the loop is optimized for speed, and
2087 2) there is at least one unsupported misaligned data ref with an unknown
2088 misalignment, and
2089 3) all misaligned data refs with a known misalignment are supported, and
2090 4) the number of runtime alignment checks is within reason. */
2092 do_versioning =
2093 optimize_loop_nest_for_speed_p (loop)
2094 && (!loop->inner); /* FORNOW */
2096 if (do_versioning)
2098 FOR_EACH_VEC_ELT (datarefs, i, dr)
2100 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2101 stmt_vec_info stmt_info = dr_info->stmt;
2103 /* For interleaving, only the alignment of the first access
2104 matters. */
2105 if (aligned_access_p (dr_info)
2106 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2107 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2108 continue;
2110 if (STMT_VINFO_STRIDED_P (stmt_info))
2112 /* Strided loads perform only component accesses, alignment is
2113 irrelevant for them. */
2114 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2115 continue;
2116 do_versioning = false;
2117 break;
2120 supportable_dr_alignment
2121 = vect_supportable_dr_alignment (dr_info, false);
2123 if (!supportable_dr_alignment)
2125 int mask;
2126 tree vectype;
2128 if (known_alignment_for_access_p (dr_info)
2129 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2130 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2132 do_versioning = false;
2133 break;
2136 vectype = STMT_VINFO_VECTYPE (stmt_info);
2137 gcc_assert (vectype);
2139 /* At present we don't support versioning for alignment
2140 with variable VF, since there's no guarantee that the
2141 VF is a power of two. We could relax this if we added
2142 a way of enforcing a power-of-two size. */
2143 unsigned HOST_WIDE_INT size;
2144 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2146 do_versioning = false;
2147 break;
2150 /* The rightmost bits of an aligned address must be zeros.
2151 Construct the mask needed for this test. For example,
2152 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2153 mask must be 15 = 0xf. */
2154 mask = size - 1;
2156 /* FORNOW: use the same mask to test all potentially unaligned
2157 references in the loop. The vectorizer currently supports
2158 a single vector size, see the reference to
2159 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2160 vectorization factor is computed. */
2161 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2162 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2163 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2164 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2168 /* Versioning requires at least one misaligned data reference. */
2169 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2170 do_versioning = false;
2171 else if (!do_versioning)
2172 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2175 if (do_versioning)
2177 vec<stmt_vec_info> may_misalign_stmts
2178 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2179 stmt_vec_info stmt_info;
2181 /* It can now be assumed that the data references in the statements
2182 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2183 of the loop being vectorized. */
2184 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2186 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2187 SET_DR_MISALIGNMENT (dr_info, 0);
2188 if (dump_enabled_p ())
2189 dump_printf_loc (MSG_NOTE, vect_location,
2190 "Alignment of access forced using versioning.\n");
2193 if (dump_enabled_p ())
2194 dump_printf_loc (MSG_NOTE, vect_location,
2195 "Versioning for alignment will be applied.\n");
2197 /* Peeling and versioning can't be done together at this time. */
2198 gcc_assert (! (do_peeling && do_versioning));
2200 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2201 gcc_assert (stat);
2202 return stat;
2205 /* This point is reached if neither peeling nor versioning is being done. */
2206 gcc_assert (! (do_peeling || do_versioning));
2208 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2209 return stat;
2213 /* Function vect_find_same_alignment_drs.
2215 Update group and alignment relations in VINFO according to the chosen
2216 vectorization factor. */
2218 static void
2219 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2221 struct data_reference *dra = DDR_A (ddr);
2222 struct data_reference *drb = DDR_B (ddr);
2223 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2224 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2225 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2226 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2228 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2229 return;
2231 if (dra == drb)
2232 return;
2234 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2235 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2236 return;
2238 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2239 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2240 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2241 return;
2243 /* Two references with distance zero have the same alignment. */
2244 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2245 - wi::to_poly_offset (DR_INIT (drb)));
2246 if (maybe_ne (diff, 0))
2248 /* Get the wider of the two alignments. */
2249 unsigned int align_a = (vect_calculate_target_alignment (dr_info_a)
2250 / BITS_PER_UNIT);
2251 unsigned int align_b = (vect_calculate_target_alignment (dr_info_b)
2252 / BITS_PER_UNIT);
2253 unsigned int max_align = MAX (align_a, align_b);
2255 /* Require the gap to be a multiple of the larger vector alignment. */
2256 if (!multiple_p (diff, max_align))
2257 return;
2260 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2261 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2262 if (dump_enabled_p ())
2263 dump_printf_loc (MSG_NOTE, vect_location,
2264 "accesses have the same alignment: %T and %T\n",
2265 DR_REF (dra), DR_REF (drb));
2269 /* Function vect_analyze_data_refs_alignment
2271 Analyze the alignment of the data-references in the loop.
2272 Return FALSE if a data reference is found that cannot be vectorized. */
2274 opt_result
2275 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2277 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2279 /* Mark groups of data references with same alignment using
2280 data dependence information. */
2281 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2282 struct data_dependence_relation *ddr;
2283 unsigned int i;
2285 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2286 vect_find_same_alignment_drs (vinfo, ddr);
2288 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2289 struct data_reference *dr;
2291 vect_record_base_alignments (vinfo);
2292 FOR_EACH_VEC_ELT (datarefs, i, dr)
2294 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2295 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2296 vect_compute_data_ref_alignment (dr_info);
2299 return opt_result::success ();
2303 /* Analyze alignment of DRs of stmts in NODE. */
2305 static bool
2306 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2308 /* We vectorize from the first scalar stmt in the node unless
2309 the node is permuted in which case we start from the first
2310 element in the group. */
2311 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2312 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2313 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2314 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2316 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2317 vect_compute_data_ref_alignment (dr_info);
2318 /* For creating the data-ref pointer we need alignment of the
2319 first element anyway. */
2320 if (dr_info != first_dr_info)
2321 vect_compute_data_ref_alignment (first_dr_info);
2322 if (! verify_data_ref_alignment (dr_info))
2324 if (dump_enabled_p ())
2325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2326 "not vectorized: bad data alignment in basic "
2327 "block.\n");
2328 return false;
2331 return true;
2334 /* Function vect_slp_analyze_and_verify_instance_alignment
2336 Analyze the alignment of the data-references in the SLP instance.
2337 Return FALSE if a data reference is found that cannot be vectorized. */
2339 bool
2340 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2342 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2344 slp_tree node;
2345 unsigned i;
2346 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2347 if (! vect_slp_analyze_and_verify_node_alignment (node))
2348 return false;
2350 node = SLP_INSTANCE_TREE (instance);
2351 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2352 && ! vect_slp_analyze_and_verify_node_alignment
2353 (SLP_INSTANCE_TREE (instance)))
2354 return false;
2356 return true;
2360 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2361 accesses of legal size, step, etc. Detect gaps, single element
2362 interleaving, and other special cases. Set grouped access info.
2363 Collect groups of strided stores for further use in SLP analysis.
2364 Worker for vect_analyze_group_access. */
2366 static bool
2367 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2369 data_reference *dr = dr_info->dr;
2370 tree step = DR_STEP (dr);
2371 tree scalar_type = TREE_TYPE (DR_REF (dr));
2372 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2373 stmt_vec_info stmt_info = dr_info->stmt;
2374 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2375 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2376 HOST_WIDE_INT dr_step = -1;
2377 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2378 bool slp_impossible = false;
2380 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2381 size of the interleaving group (including gaps). */
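/* E.g. for 4-byte elements accessed as a[4*i] and a[4*i + 2] the step
is 16 bytes, so GROUPSIZE is 16 / 4 = 4 even though only two of the
four elements are actually accessed. */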
2382 if (tree_fits_shwi_p (step))
2384 dr_step = tree_to_shwi (step);
2385 /* Check that STEP is a multiple of type size. Otherwise there is
2386 a non-element-sized gap at the end of the group which we
2387 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2388 ??? As we can handle non-constant step fine here we should
2389 simply remove uses of DR_GROUP_GAP between the last and first
2390 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2391 simply not include that gap. */
2392 if ((dr_step % type_size) != 0)
2394 if (dump_enabled_p ())
2395 dump_printf_loc (MSG_NOTE, vect_location,
2396 "Step %T is not a multiple of the element size"
2397 " for %T\n",
2398 step, DR_REF (dr));
2399 return false;
2401 groupsize = absu_hwi (dr_step) / type_size;
2403 else
2404 groupsize = 0;
2406 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2407 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2409 /* Check whether this DR is part of an interleaving group and is the
2410 single element of that group accessed in the loop. */
2412 /* Gaps are supported only for loads. STEP must be a multiple of the type
2413 size. */
2414 if (DR_IS_READ (dr)
2415 && (dr_step % type_size) == 0
2416 && groupsize > 0)
2418 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2419 DR_GROUP_SIZE (stmt_info) = groupsize;
2420 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2421 if (dump_enabled_p ())
2422 dump_printf_loc (MSG_NOTE, vect_location,
2423 "Detected single element interleaving %T"
2424 " step %T\n",
2425 DR_REF (dr), step);
2427 return true;
2430 if (dump_enabled_p ())
2431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2432 "not consecutive access %G", stmt_info->stmt);
2434 if (bb_vinfo)
2436 /* Mark the statement as unvectorizable. */
2437 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2438 return true;
2441 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2442 STMT_VINFO_STRIDED_P (stmt_info) = true;
2443 return true;
2446 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2448 /* First stmt in the interleaving chain. Check the chain. */
2449 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2450 struct data_reference *data_ref = dr;
2451 unsigned int count = 1;
2452 tree prev_init = DR_INIT (data_ref);
2453 stmt_vec_info prev = stmt_info;
2454 HOST_WIDE_INT diff, gaps = 0;
2456 /* By construction, all group members have INTEGER_CST DR_INITs. */
2457 while (next)
2459 /* Skip same data-refs. In case two or more stmts share a
2460 data-ref (supported only for loads), we vectorize only the first
2461 stmt, and the rest get their vectorized loads from the first
2462 one. */
2463 if (!tree_int_cst_compare (DR_INIT (data_ref),
2464 DR_INIT (STMT_VINFO_DATA_REF (next))))
2466 if (DR_IS_WRITE (data_ref))
2468 if (dump_enabled_p ())
2469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2470 "Two store stmts share the same dr.\n");
2471 return false;
2474 if (dump_enabled_p ())
2475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2476 "Two or more load stmts share the same dr.\n");
2478 /* For loads use the same data-ref load. */
2479 DR_GROUP_SAME_DR_STMT (next) = prev;
2481 prev = next;
2482 next = DR_GROUP_NEXT_ELEMENT (next);
2483 continue;
2486 prev = next;
2487 data_ref = STMT_VINFO_DATA_REF (next);
2489 /* All group members have the same STEP by construction. */
2490 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2492 /* Check that the distance between two accesses is equal to the type
2493 size. Otherwise, we have gaps. */
2494 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2495 - TREE_INT_CST_LOW (prev_init)) / type_size;
2496 if (diff != 1)
2498 /* FORNOW: SLP of accesses with gaps is not supported. */
2499 slp_impossible = true;
2500 if (DR_IS_WRITE (data_ref))
2502 if (dump_enabled_p ())
2503 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2504 "interleaved store with gaps\n");
2505 return false;
2508 gaps += diff - 1;
2511 last_accessed_element += diff;
2513 /* Store the gap from the previous member of the group. If there is no
2514 gap in the access, DR_GROUP_GAP is always 1. */
2515 DR_GROUP_GAP (next) = diff;
2517 prev_init = DR_INIT (data_ref);
2518 next = DR_GROUP_NEXT_ELEMENT (next);
2519 /* Count the number of data-refs in the chain. */
2520 count++;
2523 if (groupsize == 0)
2524 groupsize = count + gaps;
2526 /* This could be UINT_MAX but as we are generating code in a very
2527 inefficient way we have to cap earlier. See PR78699 for example. */
2528 if (groupsize > 4096)
2530 if (dump_enabled_p ())
2531 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2532 "group is too large\n");
2533 return false;
2536 /* Check that the size of the interleaving is equal to count for stores,
2537 i.e., that there are no gaps. */
2538 if (groupsize != count
2539 && !DR_IS_READ (dr))
2541 groupsize = count;
2542 STMT_VINFO_STRIDED_P (stmt_info) = true;
2545 /* If there is a gap after the last load in the group it is the
2546 difference between the groupsize and the last accessed
2547 element.
2548 When there is no gap, this difference should be 0. */
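/* E.g. for a group of 4-byte loads a[4*i] and a[4*i + 2], GROUPSIZE is
4 and the last accessed element is the third one, so the gap after
the group is 4 - 3 = 1 element. */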
2549 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2551 DR_GROUP_SIZE (stmt_info) = groupsize;
2552 if (dump_enabled_p ())
2554 dump_printf_loc (MSG_NOTE, vect_location,
2555 "Detected interleaving ");
2556 if (DR_IS_READ (dr))
2557 dump_printf (MSG_NOTE, "load ");
2558 else if (STMT_VINFO_STRIDED_P (stmt_info))
2559 dump_printf (MSG_NOTE, "strided store ");
2560 else
2561 dump_printf (MSG_NOTE, "store ");
2562 dump_printf (MSG_NOTE, "of size %u starting with %G",
2563 (unsigned)groupsize, stmt_info->stmt);
2564 if (DR_GROUP_GAP (stmt_info) != 0)
2565 dump_printf_loc (MSG_NOTE, vect_location,
2566 "There is a gap of %u elements after the group\n",
2567 DR_GROUP_GAP (stmt_info));
2570 /* SLP: create an SLP data structure for every interleaving group of
2571 stores for further analysis in vect_analyze_slp. */
2572 if (DR_IS_WRITE (dr) && !slp_impossible)
2574 if (loop_vinfo)
2575 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2576 if (bb_vinfo)
2577 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2581 return true;
2584 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2585 accesses of legal size, step, etc. Detect gaps, single element
2586 interleaving, and other special cases. Set grouped access info.
2587 Collect groups of strided stores for further use in SLP analysis. */
2589 static bool
2590 vect_analyze_group_access (dr_vec_info *dr_info)
2592 if (!vect_analyze_group_access_1 (dr_info))
2594 /* Dissolve the group if present. */
2595 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2596 while (stmt_info)
2598 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2599 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2600 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2601 stmt_info = next;
2603 return false;
2605 return true;
2608 /* Analyze the access pattern of the data-reference DR_INFO.
2609 In case of non-consecutive accesses call vect_analyze_group_access() to
2610 analyze groups of accesses. */
2612 static bool
2613 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2615 data_reference *dr = dr_info->dr;
2616 tree step = DR_STEP (dr);
2617 tree scalar_type = TREE_TYPE (DR_REF (dr));
2618 stmt_vec_info stmt_info = dr_info->stmt;
2619 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2620 struct loop *loop = NULL;
2622 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2623 return true;
2625 if (loop_vinfo)
2626 loop = LOOP_VINFO_LOOP (loop_vinfo);
2628 if (loop_vinfo && !step)
2630 if (dump_enabled_p ())
2631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2632 "bad data-ref access in loop\n");
2633 return false;
2636 /* Allow loads with zero step in inner-loop vectorization. */
2637 if (loop_vinfo && integer_zerop (step))
2639 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2640 if (!nested_in_vect_loop_p (loop, stmt_info))
2641 return DR_IS_READ (dr);
2642 /* Allow references with zero step only for outer loops marked
2643 with pragma omp simd, as it guarantees the absence of
2644 loop-carried dependencies between inner loop iterations. */
2645 if (loop->safelen < 2)
2647 if (dump_enabled_p ())
2648 dump_printf_loc (MSG_NOTE, vect_location,
2649 "zero step in inner loop of nest\n");
2650 return false;
2654 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2656 /* Interleaved accesses are not yet supported within outer-loop
2657 vectorization for references in the inner-loop. */
2658 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2660 /* For the rest of the analysis we use the outer-loop step. */
2661 step = STMT_VINFO_DR_STEP (stmt_info);
2662 if (integer_zerop (step))
2664 if (dump_enabled_p ())
2665 dump_printf_loc (MSG_NOTE, vect_location,
2666 "zero step in outer loop.\n");
2667 return DR_IS_READ (dr);
2671 /* Consecutive? */
2672 if (TREE_CODE (step) == INTEGER_CST)
2674 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2675 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2676 || (dr_step < 0
2677 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2679 /* Mark that it is not interleaving. */
2680 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2681 return true;
2685 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2687 if (dump_enabled_p ())
2688 dump_printf_loc (MSG_NOTE, vect_location,
2689 "grouped access in outer loop.\n");
2690 return false;
2694 /* Assume this is a DR handled by the non-constant strided load case. */
2695 if (TREE_CODE (step) != INTEGER_CST)
2696 return (STMT_VINFO_STRIDED_P (stmt_info)
2697 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2698 || vect_analyze_group_access (dr_info)));
2700 /* Not a consecutive access - check if it's part of an interleaving group. */
2701 return vect_analyze_group_access (dr_info);
2704 /* Compare two data-references DRA and DRB in order to sort them into
2705 chunks suitable for grouping. */
2707 static int
2708 dr_group_sort_cmp (const void *dra_, const void *drb_)
2710 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2711 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2712 int cmp;
2714 /* Stabilize sort. */
2715 if (dra == drb)
2716 return 0;
2718 /* DRs in different loops never belong to the same group. */
2719 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2720 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2721 if (loopa != loopb)
2722 return loopa->num < loopb->num ? -1 : 1;
2724 /* Ordering of DRs according to base. */
2725 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2726 DR_BASE_ADDRESS (drb));
2727 if (cmp != 0)
2728 return cmp;
2730 /* And according to DR_OFFSET. */
2731 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2732 if (cmp != 0)
2733 return cmp;
2735 /* Put reads before writes. */
2736 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2737 return DR_IS_READ (dra) ? -1 : 1;
2739 /* Then sort by access size. */
2740 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2741 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2742 if (cmp != 0)
2743 return cmp;
2745 /* And by step. */
2746 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2747 if (cmp != 0)
2748 return cmp;
2750 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2751 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2752 if (cmp == 0)
2753 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2754 return cmp;
2757 /* If OP is the result of a conversion, return the unconverted value,
2758 otherwise return null. */
2760 static tree
2761 strip_conversion (tree op)
2763 if (TREE_CODE (op) != SSA_NAME)
2764 return NULL_TREE;
2765 gimple *stmt = SSA_NAME_DEF_STMT (op);
2766 if (!is_gimple_assign (stmt)
2767 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2768 return NULL_TREE;
2769 return gimple_assign_rhs1 (stmt);
2772 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2773 and STMT2_INFO being in a single group. */
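/* E.g. two ordinary assignments can be grouped, as can two
IFN_MASK_LOAD calls whose mask arguments are equal up to conversions;
a masked and an unmasked access are never grouped together. */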
2775 static bool
2776 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
2778 if (gimple_assign_single_p (stmt1_info->stmt))
2779 return gimple_assign_single_p (stmt2_info->stmt);
2781 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2782 if (call1 && gimple_call_internal_p (call1))
2784 /* Check for two masked loads or two masked stores. */
2785 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2786 if (!call2 || !gimple_call_internal_p (call2))
2787 return false;
2788 internal_fn ifn = gimple_call_internal_fn (call1);
2789 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2790 return false;
2791 if (ifn != gimple_call_internal_fn (call2))
2792 return false;
2794 /* Check that the masks are the same. Cope with casts of masks,
2795 like those created by build_mask_conversion. */
2796 tree mask1 = gimple_call_arg (call1, 2);
2797 tree mask2 = gimple_call_arg (call2, 2);
2798 if (!operand_equal_p (mask1, mask2, 0))
2800 mask1 = strip_conversion (mask1);
2801 if (!mask1)
2802 return false;
2803 mask2 = strip_conversion (mask2);
2804 if (!mask2)
2805 return false;
2806 if (!operand_equal_p (mask1, mask2, 0))
2807 return false;
2809 return true;
2812 return false;
2815 /* Function vect_analyze_data_ref_accesses.
2817 Analyze the access pattern of all the data references in the loop.
2819 FORNOW: the only access pattern that is considered vectorizable is a
2820 simple step 1 (consecutive) access.
2822 FORNOW: handle only arrays and pointer accesses. */
2824 opt_result
2825 vect_analyze_data_ref_accesses (vec_info *vinfo)
2827 unsigned int i;
2828 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2829 struct data_reference *dr;
2831 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2833 if (datarefs.is_empty ())
2834 return opt_result::success ();
2836 /* Sort the array of datarefs to make building the interleaving chains
2837 linear. Don't modify the original vector's order; it is needed for
2838 determining what dependencies are reversed. */
2839 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2840 datarefs_copy.qsort (dr_group_sort_cmp);
2842 /* Build the interleaving chains. */
2843 for (i = 0; i < datarefs_copy.length () - 1;)
2845 data_reference_p dra = datarefs_copy[i];
2846 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2847 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2848 stmt_vec_info lastinfo = NULL;
2849 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2850 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2852 ++i;
2853 continue;
2855 for (i = i + 1; i < datarefs_copy.length (); ++i)
2857 data_reference_p drb = datarefs_copy[i];
2858 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2859 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2860 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2861 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2862 break;
2864 /* ??? Imperfect sorting (non-compatible types, non-modulo
2865 accesses, same accesses) can lead to a group being artificially
2866 split here as we don't just skip over those. If it really
2867 matters we can push those to a worklist and re-iterate
2868 over them. Then we can just skip ahead to the next DR here. */
2870 /* DRs in different loops should not be put into the same
2871 interleaving group. */
2872 if (gimple_bb (DR_STMT (dra))->loop_father
2873 != gimple_bb (DR_STMT (drb))->loop_father)
2874 break;
2876 /* Check that the data-refs have the same first location (except init)
2877 and that they are both either stores or loads (not one load and
2878 one store, and not a mix of masked and unmasked accesses). */
2879 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2880 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2881 DR_BASE_ADDRESS (drb)) != 0
2882 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2883 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
2884 break;
2886 /* Check that the data-refs have the same constant size. */
2887 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2888 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2889 if (!tree_fits_uhwi_p (sza)
2890 || !tree_fits_uhwi_p (szb)
2891 || !tree_int_cst_equal (sza, szb))
2892 break;
2894 /* Check that the data-refs have the same step. */
2895 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2896 break;
2898 /* Check the types are compatible.
2899 ??? We don't distinguish this during sorting. */
2900 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2901 TREE_TYPE (DR_REF (drb))))
2902 break;
2904 /* Check that the DR_INITs are compile-time constants. */
2905 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2906 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2907 break;
2909 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2910 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2911 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2912 HOST_WIDE_INT init_prev
2913 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2914 gcc_assert (init_a <= init_b
2915 && init_a <= init_prev
2916 && init_prev <= init_b);
2918 /* Do not place the same access in the interleaving chain twice. */
2919 if (init_b == init_prev)
2921 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2922 < gimple_uid (DR_STMT (drb)));
2923 /* ??? For now we simply "drop" the later reference which is
2924 otherwise the same rather than finishing off this group.
2925 In the end we'd want to re-process duplicates forming
2926 multiple groups from the refs, likely by just collecting
2927 all candidates (including duplicates and split points
2928 below) in a vector and then processing them together. */
2929 continue;
2932 /* If init_b == init_a + the size of the type * k, we have an
2933 interleaving, and DRA is accessed before DRB. */
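/* For example, two 4-byte loads with DR_INITs 0 and 8 satisfy this
with k = 2 and may form one interleaving chain (with a gap), whereas
DR_INITs 0 and 6 would split the group here. */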
2934 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2935 if (type_size_a == 0
2936 || (init_b - init_a) % type_size_a != 0)
2937 break;
2939 /* If we have a store, the accesses are adjacent. This splits
2940 groups into chunks we support (we don't support vectorization
2941 of stores with gaps). */
2942 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2943 break;
2945 /* If the step (when constant and nonzero) is no greater than the
2946 difference between the data-refs' inits, start a new group here;
2947 this splits groups into suitable sizes. */
2948 if (tree_fits_shwi_p (DR_STEP (dra)))
2950 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2951 if (step != 0 && step <= (init_b - init_a))
2952 break;
2955 if (dump_enabled_p ())
2956 dump_printf_loc (MSG_NOTE, vect_location,
2957 DR_IS_READ (dra)
2958 ? "Detected interleaving load %T and %T\n"
2959 : "Detected interleaving store %T and %T\n",
2960 DR_REF (dra), DR_REF (drb));
2962 /* Link the found element into the group list. */
2963 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
2965 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
2966 lastinfo = stmtinfo_a;
2968 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
2969 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
2970 lastinfo = stmtinfo_b;
2974 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2976 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2977 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
2978 && !vect_analyze_data_ref_access (dr_info))
2980 if (dump_enabled_p ())
2981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2982 "not vectorized: complicated access pattern.\n");
2984 if (is_a <bb_vec_info> (vinfo))
2986 /* Mark the statement as not vectorizable. */
2987 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
2988 continue;
2990 else
2992 datarefs_copy.release ();
2993 return opt_result::failure_at (dr_info->stmt->stmt,
2994 "not vectorized:"
2995 " complicated access pattern.\n");
3000 datarefs_copy.release ();
3001 return opt_result::success ();
3004 /* Function vect_vfa_segment_size.
3006 Input:
3007 DR_INFO: The data reference.
3008 LENGTH_FACTOR: segment length to consider.
3010 Return a value suitable for the dr_with_seg_len::seg_len field.
3011 This is the "distance travelled" by the pointer from the first
3012 iteration in the segment to the last. Note that it does not include
3013 the size of the access; in effect it only describes the first byte. */
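/* For example, a step of 4 bytes and a LENGTH_FACTOR of 8 iterations
give a segment length of 4 * (8 - 1) = 28 bytes; the bytes accessed
in the final iteration are covered separately by the access size. */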
3015 static tree
3016 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3018 length_factor = size_binop (MINUS_EXPR,
3019 fold_convert (sizetype, length_factor),
3020 size_one_node);
3021 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3022 length_factor);
3025 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3026 gives the worst-case number of bytes covered by the segment. */
3028 static unsigned HOST_WIDE_INT
3029 vect_vfa_access_size (dr_vec_info *dr_info)
3031 stmt_vec_info stmt_vinfo = dr_info->stmt;
3032 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3033 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3034 unsigned HOST_WIDE_INT access_size = ref_size;
3035 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3037 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3038 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3040 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3041 && (vect_supportable_dr_alignment (dr_info, false)
3042 == dr_explicit_realign_optimized))
3044 /* We might access a full vector's worth. */
3045 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3046 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3048 return access_size;
3051 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3052 describes. */
3054 static unsigned int
3055 vect_vfa_align (dr_vec_info *dr_info)
3057 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3060 /* Function vect_no_alias_p.
3062 Given data references A and B with equal base and offset, see whether
3063 the alias relation can be decided at compilation time. Return 1 if
3064 it can and the references alias, 0 if it can and the references do
3065 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3066 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3067 of dr_with_seg_len::{seg_len,access_size} for A and B. */
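/* For instance, 4-byte accesses starting at offsets 0 and 32 with
segment lengths of 28 bytes cover the byte ranges [0, 32) and
[32, 64); these provably do not overlap, so the function returns 0
and the runtime check can be dropped. */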
3069 static int
3070 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3071 tree segment_length_a, tree segment_length_b,
3072 unsigned HOST_WIDE_INT access_size_a,
3073 unsigned HOST_WIDE_INT access_size_b)
3075 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3076 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3077 poly_uint64 const_length_a;
3078 poly_uint64 const_length_b;
3080 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3081 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3082 [a, a+12) */
3083 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3085 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3086 offset_a = (offset_a + access_size_a) - const_length_a;
3088 else
3089 const_length_a = tree_to_poly_uint64 (segment_length_a);
3090 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3092 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3093 offset_b = (offset_b + access_size_b) - const_length_b;
3095 else
3096 const_length_b = tree_to_poly_uint64 (segment_length_b);
3098 const_length_a += access_size_a;
3099 const_length_b += access_size_b;
3101 if (ranges_known_overlap_p (offset_a, const_length_a,
3102 offset_b, const_length_b))
3103 return 1;
3105 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3106 offset_b, const_length_b))
3107 return 0;
3109 return -1;
3112 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3113 in DDR is >= VF. */
3115 static bool
3116 dependence_distance_ge_vf (data_dependence_relation *ddr,
3117 unsigned int loop_depth, poly_uint64 vf)
3119 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3120 || DDR_NUM_DIST_VECTS (ddr) == 0)
3121 return false;
3123 /* If the dependence is exact, we should have limited the VF instead. */
3124 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3126 unsigned int i;
3127 lambda_vector dist_v;
3128 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3130 HOST_WIDE_INT dist = dist_v[loop_depth];
3131 if (dist != 0
3132 && !(dist > 0 && DDR_REVERSED_P (ddr))
3133 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3134 return false;
3137 if (dump_enabled_p ())
3138 dump_printf_loc (MSG_NOTE, vect_location,
3139 "dependence distance between %T and %T is >= VF\n",
3140 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3142 return true;
3145 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3147 static void
3148 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3150 dump_printf (dump_kind, "%s (%T) >= ",
3151 lower_bound.unsigned_p ? "unsigned" : "abs",
3152 lower_bound.expr);
3153 dump_dec (dump_kind, lower_bound.min_value);
3156 /* Record that the vectorized loop requires the vec_lower_bound described
3157 by EXPR, UNSIGNED_P and MIN_VALUE. */
3159 static void
3160 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3161 poly_uint64 min_value)
3163 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3164 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3165 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3167 unsigned_p &= lower_bounds[i].unsigned_p;
3168 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3169 if (lower_bounds[i].unsigned_p != unsigned_p
3170 || maybe_lt (lower_bounds[i].min_value, min_value))
3172 lower_bounds[i].unsigned_p = unsigned_p;
3173 lower_bounds[i].min_value = min_value;
3174 if (dump_enabled_p ())
3176 dump_printf_loc (MSG_NOTE, vect_location,
3177 "updating run-time check to ");
3178 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3179 dump_printf (MSG_NOTE, "\n");
3182 return;
3185 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3186 if (dump_enabled_p ())
3188 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3189 dump_lower_bound (MSG_NOTE, lower_bound);
3190 dump_printf (MSG_NOTE, "\n");
3192 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3195 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3196 will span fewer than GAP bytes. */
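/* For instance, with an estimated VF of 4, a group of size 2 and
4-byte scalar accesses, the vectorized step is expected to cover
4 * 2 * 4 = 32 bytes, so a GAP of up to 32 bytes counts as small. */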
3198 static bool
3199 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3200 poly_int64 gap)
3202 stmt_vec_info stmt_info = dr_info->stmt;
3203 HOST_WIDE_INT count
3204 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3205 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3206 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3207 return (estimated_poly_value (gap)
3208 <= count * vect_get_scalar_dr_size (dr_info));
3211 /* Return true if we know that there is no alias between DR_INFO_A and
3212 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3213 When returning true, set *LOWER_BOUND_OUT to this N. */
3215 static bool
3216 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3217 poly_uint64 *lower_bound_out)
3219 /* Check that there is a constant gap of known sign between DR_A
3220 and DR_B. */
3221 data_reference *dr_a = dr_info_a->dr;
3222 data_reference *dr_b = dr_info_b->dr;
3223 poly_int64 init_a, init_b;
3224 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3225 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3226 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3227 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3228 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3229 || !ordered_p (init_a, init_b))
3230 return false;
3232 /* Sort DR_A and DR_B by the address they access. */
3233 if (maybe_lt (init_b, init_a))
3235 std::swap (init_a, init_b);
3236 std::swap (dr_info_a, dr_info_b);
3237 std::swap (dr_a, dr_b);
3240 /* If the two accesses could be dependent within a scalar iteration,
3241 make sure that we'd retain their order. */
3242 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3243 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3244 return false;
3246 /* There is no alias if abs (DR_STEP) is greater than or equal to
3247 the bytes spanned by the combination of the two accesses. */
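/* E.g. for 4-byte accesses at byte offsets 0 and 8 within an
iteration the bound is 8 + 4 - 0 = 12, so |DR_STEP| >= 12 guarantees
the two accesses cannot alias. */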
3248 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3249 return true;
3252 /* Function vect_prune_runtime_alias_test_list.
3254 Prune a list of ddrs to be tested at run-time by versioning for alias.
3255 Merge several alias checks into one if possible.
3256 Return FALSE if the resulting list of ddrs is longer than allowed by
3257 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3259 opt_result
3260 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3262 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3263 hash_set <tree_pair_hash> compared_objects;
3265 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3266 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3267 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3268 vec<vec_object_pair> &check_unequal_addrs
3269 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3270 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3271 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3273 ddr_p ddr;
3274 unsigned int i;
3275 tree length_factor;
3277 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3279 /* Step values are irrelevant for aliasing if the number of vector
3280 iterations is equal to the number of scalar iterations (which can
3281 happen for fully-SLP loops). */
3282 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3284 if (!ignore_step_p)
3286 /* Convert the checks for nonzero steps into bound tests. */
3287 tree value;
3288 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3289 vect_check_lower_bound (loop_vinfo, value, true, 1);
3292 if (may_alias_ddrs.is_empty ())
3293 return opt_result::success ();
3295 comp_alias_ddrs.create (may_alias_ddrs.length ());
3297 unsigned int loop_depth
3298 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3299 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3301 /* First, we collect all data ref pairs for aliasing checks. */
3302 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3304 int comp_res;
3305 poly_uint64 lower_bound;
3306 tree segment_length_a, segment_length_b;
3307 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3308 unsigned int align_a, align_b;
3310 /* Ignore the alias if the VF we chose ended up being no greater
3311 than the dependence distance. */
3312 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3313 continue;
3315 if (DDR_OBJECT_A (ddr))
3317 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3318 if (!compared_objects.add (new_pair))
3320 if (dump_enabled_p ())
3321 dump_printf_loc (MSG_NOTE, vect_location,
3322 "checking that %T and %T"
3323 " have different addresses\n",
3324 new_pair.first, new_pair.second);
3325 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3327 continue;
3330 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3331 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3333 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3334 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3336 /* Skip the pair if inter-iteration dependencies are irrelevant
3337 and intra-iteration dependencies are guaranteed to be honored. */
3338 if (ignore_step_p
3339 && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3340 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3341 &lower_bound)))
3343 if (dump_enabled_p ())
3344 dump_printf_loc (MSG_NOTE, vect_location,
3345 "no need for alias check between "
3346 "%T and %T when VF is 1\n",
3347 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3348 continue;
3351 /* See whether we can handle the alias using a bounds check on
3352 the step, and whether that's likely to be the best approach.
3353 (It might not be, for example, if the minimum step is much larger
3354 than the number of bytes handled by one vector iteration.) */
3355 if (!ignore_step_p
3356 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3357 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3358 &lower_bound)
3359 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3360 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3362 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3363 if (dump_enabled_p ())
3365 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3366 "%T and %T when the step %T is outside ",
3367 DR_REF (dr_info_a->dr),
3368 DR_REF (dr_info_b->dr),
3369 DR_STEP (dr_info_a->dr));
3370 if (unsigned_p)
3371 dump_printf (MSG_NOTE, "[0");
3372 else
3374 dump_printf (MSG_NOTE, "(");
3375 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3377 dump_printf (MSG_NOTE, ", ");
3378 dump_dec (MSG_NOTE, lower_bound);
3379 dump_printf (MSG_NOTE, ")\n");
3381 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3382 unsigned_p, lower_bound);
3383 continue;
3386 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3387 if (dr_group_first_a)
3389 stmt_info_a = dr_group_first_a;
3390 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3393 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3394 if (dr_group_first_b)
3396 stmt_info_b = dr_group_first_b;
3397 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3400 if (ignore_step_p)
3402 segment_length_a = size_zero_node;
3403 segment_length_b = size_zero_node;
3405 else
3407 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3408 DR_STEP (dr_info_b->dr), 0))
3409 length_factor = scalar_loop_iters;
3410 else
3411 length_factor = size_int (vect_factor);
3412 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3413 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3415 access_size_a = vect_vfa_access_size (dr_info_a);
3416 access_size_b = vect_vfa_access_size (dr_info_b);
3417 align_a = vect_vfa_align (dr_info_a);
3418 align_b = vect_vfa_align (dr_info_b);
3420 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3421 DR_BASE_ADDRESS (dr_info_b->dr));
3422 if (comp_res == 0)
3423 comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3424 DR_OFFSET (dr_info_b->dr));
3426 /* See whether the alias is known at compilation time. */
3427 if (comp_res == 0
3428 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3429 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3430 && poly_int_tree_p (segment_length_a)
3431 && poly_int_tree_p (segment_length_b))
3433 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3434 segment_length_a,
3435 segment_length_b,
3436 access_size_a,
3437 access_size_b);
3438 if (res >= 0 && dump_enabled_p ())
3440 dump_printf_loc (MSG_NOTE, vect_location,
3441 "can tell at compile time that %T and %T",
3442 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3443 if (res == 0)
3444 dump_printf (MSG_NOTE, " do not alias\n");
3445 else
3446 dump_printf (MSG_NOTE, " alias\n");
3449 if (res == 0)
3450 continue;
3452 if (res == 1)
3453 return opt_result::failure_at (stmt_info_b->stmt,
3454 "not vectorized:"
3455 " compilation time alias: %G%G",
3456 stmt_info_a->stmt,
3457 stmt_info_b->stmt);
3460 dr_with_seg_len_pair_t dr_with_seg_len_pair
3461 (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3462 access_size_a, align_a),
3463 dr_with_seg_len (dr_info_b->dr, segment_length_b,
3464 access_size_b, align_b));
3466 /* Canonicalize pairs by sorting the two DR members. */
3467 if (comp_res > 0)
3468 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3470 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3473 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3475 unsigned int count = (comp_alias_ddrs.length ()
3476 + check_unequal_addrs.length ());
3478 dump_printf_loc (MSG_NOTE, vect_location,
3479 "improved number of alias checks from %d to %d\n",
3480 may_alias_ddrs.length (), count);
3481 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3482 return opt_result::failure_at
3483 (vect_location,
3484 "number of versioning for alias "
3485 "run-time tests exceeds %d "
3486 "(--param vect-max-version-for-alias-checks)\n",
3487 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3489 return opt_result::success ();
3492 /* Check whether we can use an internal function for a gather load
3493 or scatter store. READ_P is true for loads and false for stores.
3494 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3495 the type of the memory elements being loaded or stored. OFFSET_BITS
3496 is the number of bits in each scalar offset and OFFSET_SIGN is the
3497 sign of the offset. SCALE is the amount by which the offset should
3498 be multiplied *after* it has been converted to address width.
3500 Return true if the function is supported, storing the function
3501 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
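/* Illustrative example (an assumption, not taken from the original source):
   for an unmasked gather of 32-bit elements indexed by 32-bit signed
   offsets that are scaled by 4, a caller would do roughly

     internal_fn ifn;
     tree elt_type;
     if (vect_gather_scatter_fn_p (true, false, vectype, memory_type,
                                   32, SIGNED, 4, &ifn, &elt_type))
       ... ifn is IFN_GATHER_LOAD and elt_type is TREE_TYPE (vectype) ...

   provided the target advertises the combination through
   internal_gather_scatter_fn_supported_p.  */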
3503 bool
3504 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3505 tree memory_type, unsigned int offset_bits,
3506 signop offset_sign, int scale,
3507 internal_fn *ifn_out, tree *element_type_out)
3509 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3510 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3511 if (offset_bits > element_bits)
3512 /* Internal functions require the offset to be the same width as
3513 the vector elements. We can extend narrower offsets, but it isn't
3514 safe to truncate wider offsets. */
3515 return false;
3517 if (element_bits != memory_bits)
3518 /* For now the vector elements must be the same width as the
3519 memory elements. */
3520 return false;
3522 /* Work out which function we need. */
3523 internal_fn ifn;
3524 if (read_p)
3525 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3526 else
3527 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3529 /* Test whether the target supports this combination. */
3530 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3531 offset_sign, scale))
3532 return false;
3534 *ifn_out = ifn;
3535 *element_type_out = TREE_TYPE (vectype);
3536 return true;
3539 /* STMT_INFO is a call to an internal gather load or scatter store function.
3540 Describe the operation in INFO. */
3542 static void
3543 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3544 gather_scatter_info *info)
3546 gcall *call = as_a <gcall *> (stmt_info->stmt);
3547 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3548 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3550 info->ifn = gimple_call_internal_fn (call);
3551 info->decl = NULL_TREE;
3552 info->base = gimple_call_arg (call, 0);
3553 info->offset = gimple_call_arg (call, 1);
3554 info->offset_dt = vect_unknown_def_type;
3555 info->offset_vectype = NULL_TREE;
3556 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3557 info->element_type = TREE_TYPE (vectype);
3558 info->memory_type = TREE_TYPE (DR_REF (dr));
3561 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3562 gather load or scatter store. Describe the operation in *INFO if so. */
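/* A typical caller, mirroring vect_analyze_data_refs further below
   (illustrative sketch only):

     gather_scatter_info gs_info;
     if (vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
       ... use gs_info.base, gs_info.offset and gs_info.scale ...

   where the filled-in fields describe the loop-invariant base, the
   vectorizable offset and the scaling factor recovered by the analysis.  */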
3564 bool
3565 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3566 gather_scatter_info *info)
3568 HOST_WIDE_INT scale = 1;
3569 poly_int64 pbitpos, pbitsize;
3570 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3571 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3572 tree offtype = NULL_TREE;
3573 tree decl = NULL_TREE, base, off;
3574 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3575 tree memory_type = TREE_TYPE (DR_REF (dr));
3576 machine_mode pmode;
3577 int punsignedp, reversep, pvolatilep = 0;
3578 internal_fn ifn;
3579 tree element_type;
3580 bool masked_p = false;
3582 /* See whether this is already a call to a gather/scatter internal function.
3583 If not, see whether it's a masked load or store. */
3584 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3585 if (call && gimple_call_internal_p (call))
3587 ifn = gimple_call_internal_fn (call);
3588 if (internal_gather_scatter_fn_p (ifn))
3590 vect_describe_gather_scatter_call (stmt_info, info);
3591 return true;
3593 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3596 /* True if we should aim to use internal functions rather than
3597 built-in functions. */
3598 bool use_ifn_p = (DR_IS_READ (dr)
3599 ? supports_vec_gather_load_p ()
3600 : supports_vec_scatter_store_p ());
3602 base = DR_REF (dr);
3603 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3604 see if we can use the def stmt of the address. */
3605 if (masked_p
3606 && TREE_CODE (base) == MEM_REF
3607 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3608 && integer_zerop (TREE_OPERAND (base, 1))
3609 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3611 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3612 if (is_gimple_assign (def_stmt)
3613 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3614 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3617 /* The gather and scatter builtins need address of the form
3618 loop_invariant + vector * {1, 2, 4, 8}
3620 or loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3621 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3622 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3623 multiplications and additions in it. To get a vector, we need
3624 a single SSA_NAME that will be defined in the loop and will
3625 contain everything that is not loop invariant and that can be
3626 vectorized. The following code attempts to find such a preexisting
3627 SSA_NAME OFF and put the loop invariants into a tree BASE
3628 that can be gimplified before the loop. */
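/* Illustrative example (an assumption, not from the original source): for
   a read such as

     ... = a[k[i]];        with 'a' loop invariant and 'k[i]' varying,

   the decomposition below is expected to finish with BASE holding &a plus
   any constant byte offset, OFF being the SSA_NAME that carries k[i], and
   SCALE equal to the element size of 'a', i.e. exactly the
   loop_invariant + vector * scale form described above.  */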
3629 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3630 &punsignedp, &reversep, &pvolatilep);
3631 if (reversep)
3632 return false;
3634 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3636 if (TREE_CODE (base) == MEM_REF)
3638 if (!integer_zerop (TREE_OPERAND (base, 1)))
3640 if (off == NULL_TREE)
3641 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3642 else
3643 off = size_binop (PLUS_EXPR, off,
3644 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3646 base = TREE_OPERAND (base, 0);
3648 else
3649 base = build_fold_addr_expr (base);
3651 if (off == NULL_TREE)
3652 off = size_zero_node;
3654 /* If base is not loop invariant, then if off is 0 we start with just
3655 the constant offset in the loop invariant BASE and continue with base
3656 as OFF; otherwise give up.
3657 We could handle that case by gimplifying the addition of base + off
3658 into some SSA_NAME and using that as off, but for now punt. */
3659 if (!expr_invariant_in_loop_p (loop, base))
3661 if (!integer_zerop (off))
3662 return false;
3663 off = base;
3664 base = size_int (pbytepos);
3666 /* Otherwise put base + constant offset into the loop invariant BASE
3667 and continue with OFF. */
3668 else
3670 base = fold_convert (sizetype, base);
3671 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3674 /* OFF at this point may be either a SSA_NAME or some tree expression
3675 from get_inner_reference. Try to peel off loop invariants from it
3676 into BASE as long as possible. */
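/* For example (illustrative only): if OFF is (n + i) * 4 with 'n' loop
   invariant, the MULT_EXPR case below first records SCALE = 4 and
   continues with n + i; the PLUS_EXPR case then multiplies the invariant
   'n' by the scale, folds it into BASE and continues with just 'i',
   which is the part left for vectorization.  */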
3677 STRIP_NOPS (off);
3678 while (offtype == NULL_TREE)
3680 enum tree_code code;
3681 tree op0, op1, add = NULL_TREE;
3683 if (TREE_CODE (off) == SSA_NAME)
3685 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3687 if (expr_invariant_in_loop_p (loop, off))
3688 return false;
3690 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3691 break;
3693 op0 = gimple_assign_rhs1 (def_stmt);
3694 code = gimple_assign_rhs_code (def_stmt);
3695 op1 = gimple_assign_rhs2 (def_stmt);
3697 else
3699 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3700 return false;
3701 code = TREE_CODE (off);
3702 extract_ops_from_tree (off, &code, &op0, &op1);
3704 switch (code)
3706 case POINTER_PLUS_EXPR:
3707 case PLUS_EXPR:
3708 if (expr_invariant_in_loop_p (loop, op0))
3710 add = op0;
3711 off = op1;
3712 do_add:
3713 add = fold_convert (sizetype, add);
3714 if (scale != 1)
3715 add = size_binop (MULT_EXPR, add, size_int (scale));
3716 base = size_binop (PLUS_EXPR, base, add);
3717 continue;
3719 if (expr_invariant_in_loop_p (loop, op1))
3721 add = op1;
3722 off = op0;
3723 goto do_add;
3725 break;
3726 case MINUS_EXPR:
3727 if (expr_invariant_in_loop_p (loop, op1))
3729 add = fold_convert (sizetype, op1);
3730 add = size_binop (MINUS_EXPR, size_zero_node, add);
3731 off = op0;
3732 goto do_add;
3734 break;
3735 case MULT_EXPR:
3736 if (scale == 1 && tree_fits_shwi_p (op1))
3738 int new_scale = tree_to_shwi (op1);
3739 /* Only treat this as a scaling operation if the target
3740 supports it. */
3741 if (use_ifn_p
3742 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3743 vectype, memory_type, 1,
3744 TYPE_SIGN (TREE_TYPE (op0)),
3745 new_scale, &ifn,
3746 &element_type))
3747 break;
3748 scale = new_scale;
3749 off = op0;
3750 continue;
3752 break;
3753 case SSA_NAME:
3754 off = op0;
3755 continue;
3756 CASE_CONVERT:
3757 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3758 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3759 break;
3760 if (TYPE_PRECISION (TREE_TYPE (op0))
3761 == TYPE_PRECISION (TREE_TYPE (off)))
3763 off = op0;
3764 continue;
3767 /* The internal functions need the offset to be the same width
3768 as the elements of VECTYPE. Don't include operations that
3769 cast the offset from that width to a different width. */
3770 if (use_ifn_p
3771 && (int_size_in_bytes (TREE_TYPE (vectype))
3772 == int_size_in_bytes (TREE_TYPE (off))))
3773 break;
3775 if (TYPE_PRECISION (TREE_TYPE (op0))
3776 < TYPE_PRECISION (TREE_TYPE (off)))
3778 off = op0;
3779 offtype = TREE_TYPE (off);
3780 STRIP_NOPS (off);
3781 continue;
3783 break;
3784 default:
3785 break;
3787 break;
3790 /* If at the end OFF still isn't a SSA_NAME or isn't
3791 defined in the loop, punt. */
3792 if (TREE_CODE (off) != SSA_NAME
3793 || expr_invariant_in_loop_p (loop, off))
3794 return false;
3796 if (offtype == NULL_TREE)
3797 offtype = TREE_TYPE (off);
3799 if (use_ifn_p)
3801 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3802 memory_type, TYPE_PRECISION (offtype),
3803 TYPE_SIGN (offtype), scale, &ifn,
3804 &element_type))
3805 return false;
3807 else
3809 if (DR_IS_READ (dr))
3811 if (targetm.vectorize.builtin_gather)
3812 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3814 else
3816 if (targetm.vectorize.builtin_scatter)
3817 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3820 if (!decl)
3821 return false;
3823 ifn = IFN_LAST;
3824 element_type = TREE_TYPE (vectype);
3827 info->ifn = ifn;
3828 info->decl = decl;
3829 info->base = base;
3830 info->offset = off;
3831 info->offset_dt = vect_unknown_def_type;
3832 info->offset_vectype = NULL_TREE;
3833 info->scale = scale;
3834 info->element_type = element_type;
3835 info->memory_type = memory_type;
3836 return true;
3839 /* Find the data references in STMT, analyze them with respect to LOOP and
3840 append them to DATAREFS. Return false if datarefs in this stmt cannot
3841 be handled. */
3843 opt_result
3844 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3845 vec<data_reference_p> *datarefs)
3847 /* We can ignore clobbers for dataref analysis - they are removed during
3848 loop vectorization and BB vectorization checks dependences with a
3849 stmt walk. */
3850 if (gimple_clobber_p (stmt))
3851 return opt_result::success ();
3853 if (gimple_has_volatile_ops (stmt))
3854 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
3855 stmt);
3857 if (stmt_can_throw_internal (cfun, stmt))
3858 return opt_result::failure_at (stmt,
3859 "not vectorized:"
3860 " statement can throw an exception: %G",
3861 stmt);
3863 auto_vec<data_reference_p, 2> refs;
3864 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
3865 if (!res)
3866 return res;
3868 if (refs.is_empty ())
3869 return opt_result::success ();
3871 if (refs.length () > 1)
3872 return opt_result::failure_at (stmt,
3873 "not vectorized:"
3874 " more than one data ref in stmt: %G", stmt);
3876 if (gcall *call = dyn_cast <gcall *> (stmt))
3877 if (!gimple_call_internal_p (call)
3878 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
3879 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
3880 return opt_result::failure_at (stmt,
3881 "not vectorized: dr in a call %G", stmt);
3883 data_reference_p dr = refs.pop ();
3884 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3885 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3886 return opt_result::failure_at (stmt,
3887 "not vectorized:"
3888 " statement is bitfield access %G", stmt);
3890 if (DR_BASE_ADDRESS (dr)
3891 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3892 return opt_result::failure_at (stmt,
3893 "not vectorized:"
3894 " base addr of dr is a constant\n");
3896 /* Check whether this may be a SIMD lane access and adjust the
3897 DR to make it easier for us to handle it. */
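/* Illustrative sketch (an assumption, not from the original source): for
   a privatized array in a #pragma omp simd loop, the new DR's offset has
   the shape

     (sizetype) _lane * elt_size,  with _lane defined by IFN_GOMP_SIMD_LANE,

   and the code below recognizes that pattern, rewrites the DR to a zero
   offset with a step equal to the element size, and marks it as a
   simd-lane access via newdr->aux.  */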
3898 if (loop
3899 && loop->simduid
3900 && (!DR_BASE_ADDRESS (dr)
3901 || !DR_OFFSET (dr)
3902 || !DR_INIT (dr)
3903 || !DR_STEP (dr)))
3905 struct data_reference *newdr
3906 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
3907 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
3908 if (DR_BASE_ADDRESS (newdr)
3909 && DR_OFFSET (newdr)
3910 && DR_INIT (newdr)
3911 && DR_STEP (newdr)
3912 && integer_zerop (DR_STEP (newdr)))
3914 tree off = DR_OFFSET (newdr);
3915 STRIP_NOPS (off);
3916 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3917 && TREE_CODE (off) == MULT_EXPR
3918 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3920 tree step = TREE_OPERAND (off, 1);
3921 off = TREE_OPERAND (off, 0);
3922 STRIP_NOPS (off);
3923 if (CONVERT_EXPR_P (off)
3924 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
3925 < TYPE_PRECISION (TREE_TYPE (off))))
3926 off = TREE_OPERAND (off, 0);
3927 if (TREE_CODE (off) == SSA_NAME)
3929 gimple *def = SSA_NAME_DEF_STMT (off);
3930 tree reft = TREE_TYPE (DR_REF (newdr));
3931 if (is_gimple_call (def)
3932 && gimple_call_internal_p (def)
3933 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
3935 tree arg = gimple_call_arg (def, 0);
3936 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3937 arg = SSA_NAME_VAR (arg);
3938 if (arg == loop->simduid
3939 /* For now. */
3940 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
3942 DR_OFFSET (newdr) = ssize_int (0);
3943 DR_STEP (newdr) = step;
3944 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
3945 DR_STEP_ALIGNMENT (newdr)
3946 = highest_pow2_factor (step);
3947 /* Mark as simd-lane access. */
3948 newdr->aux = (void *)-1;
3949 free_data_ref (dr);
3950 datarefs->safe_push (newdr);
3951 return opt_result::success ();
3957 free_data_ref (newdr);
3960 datarefs->safe_push (dr);
3961 return opt_result::success ();
3964 /* Function vect_analyze_data_refs.
3966 Find all the data references in the loop or basic block.
3968 The general structure of the analysis of data refs in the vectorizer is as
3969 follows:
3970 1- vect_analyze_data_refs(loop/bb): call
3971 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3972 in the loop/bb and their dependences.
3973 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3974 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3975 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3979 opt_result
3980 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3982 struct loop *loop = NULL;
3983 unsigned int i;
3984 struct data_reference *dr;
3985 tree scalar_type;
3987 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
3989 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3990 loop = LOOP_VINFO_LOOP (loop_vinfo);
3992 /* Go through the data-refs, check that the analysis succeeded. Update
3993 pointer from stmt_vec_info struct to DR and vectype. */
3995 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3996 FOR_EACH_VEC_ELT (datarefs, i, dr)
3998 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3999 poly_uint64 vf;
4001 gcc_assert (DR_REF (dr));
4002 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4003 gcc_assert (!stmt_info->dr_aux.dr);
4004 stmt_info->dr_aux.dr = dr;
4005 stmt_info->dr_aux.stmt = stmt_info;
4007 /* Check that analysis of the data-ref succeeded. */
4008 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4009 || !DR_STEP (dr))
4011 bool maybe_gather
4012 = DR_IS_READ (dr)
4013 && !TREE_THIS_VOLATILE (DR_REF (dr))
4014 && (targetm.vectorize.builtin_gather != NULL
4015 || supports_vec_gather_load_p ());
4016 bool maybe_scatter
4017 = DR_IS_WRITE (dr)
4018 && !TREE_THIS_VOLATILE (DR_REF (dr))
4019 && (targetm.vectorize.builtin_scatter != NULL
4020 || supports_vec_scatter_store_p ());
4022 /* If target supports vector gather loads or scatter stores,
4023 see if they can be used. */
4024 if (is_a <loop_vec_info> (vinfo)
4025 && !nested_in_vect_loop_p (loop, stmt_info))
4027 if (maybe_gather || maybe_scatter)
4029 if (maybe_gather)
4030 gatherscatter = GATHER;
4031 else
4032 gatherscatter = SCATTER;
4036 if (gatherscatter == SG_NONE)
4038 if (dump_enabled_p ())
4039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4040 "not vectorized: data ref analysis "
4041 "failed %G", stmt_info->stmt);
4042 if (is_a <bb_vec_info> (vinfo))
4044 /* In BB vectorization the ref can still participate
4045 in dependence analysis, we just can't vectorize it. */
4046 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4047 continue;
4049 return opt_result::failure_at (stmt_info->stmt,
4050 "not vectorized:"
4051 " data ref analysis failed: %G",
4052 stmt_info->stmt);
4056 /* See if this was detected as a SIMD lane access. */
4057 if (dr->aux == (void *)-1)
4059 if (nested_in_vect_loop_p (loop, stmt_info))
4060 return opt_result::failure_at (stmt_info->stmt,
4061 "not vectorized:"
4062 " data ref analysis failed: %G",
4063 stmt_info->stmt);
4064 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4067 tree base = get_base_address (DR_REF (dr));
4068 if (base && VAR_P (base) && DECL_NONALIASED (base))
4070 if (dump_enabled_p ())
4071 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4072 "not vectorized: base object not addressable "
4073 "for stmt: %G", stmt_info->stmt);
4074 if (is_a <bb_vec_info> (vinfo))
4076 /* In BB vectorization the ref can still participate
4077 in dependence analysis, we just can't vectorize it. */
4078 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4079 continue;
4081 return opt_result::failure_at (stmt_info->stmt,
4082 "not vectorized: base object not"
4083 " addressable for stmt: %G",
4084 stmt_info->stmt);
4087 if (is_a <loop_vec_info> (vinfo)
4088 && DR_STEP (dr)
4089 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4091 if (nested_in_vect_loop_p (loop, stmt_info))
4092 return opt_result::failure_at (stmt_info->stmt,
4093 "not vectorized:"
4094 "not suitable for strided load %G",
4095 stmt_info->stmt);
4096 STMT_VINFO_STRIDED_P (stmt_info) = true;
4099 /* Update DR field in stmt_vec_info struct. */
4101 /* If the dataref is in an inner-loop of the loop that is considered for
4102 vectorization, we also want to analyze the access relative to
4103 the outer-loop (DR contains information only relative to the
4104 inner-most enclosing loop). We do that by building a reference to the
4105 first location accessed by the inner-loop, and analyze it relative to
4106 the outer-loop. */
4107 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4109 /* Build a reference to the first location accessed by the
4110 inner loop: *(BASE + INIT + OFFSET). By construction,
4111 this address must be invariant in the inner loop, so we
4112 can consider it as being used in the outer loop. */
4113 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4114 tree offset = unshare_expr (DR_OFFSET (dr));
4115 tree init = unshare_expr (DR_INIT (dr));
4116 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4117 init, offset);
4118 tree init_addr = fold_build_pointer_plus (base, init_offset);
4119 tree init_ref = build_fold_indirect_ref (init_addr);
4121 if (dump_enabled_p ())
4122 dump_printf_loc (MSG_NOTE, vect_location,
4123 "analyze in outer loop: %T\n", init_ref);
4125 opt_result res
4126 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4127 init_ref, loop, stmt_info->stmt);
4128 if (!res)
4129 /* dr_analyze_innermost already explained the failure. */
4130 return res;
4132 if (dump_enabled_p ())
4133 dump_printf_loc (MSG_NOTE, vect_location,
4134 "\touter base_address: %T\n"
4135 "\touter offset from base address: %T\n"
4136 "\touter constant offset from base address: %T\n"
4137 "\touter step: %T\n"
4138 "\touter base alignment: %d\n\n"
4139 "\touter base misalignment: %d\n"
4140 "\touter offset alignment: %d\n"
4141 "\touter step alignment: %d\n",
4142 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4143 STMT_VINFO_DR_OFFSET (stmt_info),
4144 STMT_VINFO_DR_INIT (stmt_info),
4145 STMT_VINFO_DR_STEP (stmt_info),
4146 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4147 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4148 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4149 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4152 /* Set vectype for STMT. */
4153 scalar_type = TREE_TYPE (DR_REF (dr));
4154 STMT_VINFO_VECTYPE (stmt_info)
4155 = get_vectype_for_scalar_type (scalar_type);
4156 if (!STMT_VINFO_VECTYPE (stmt_info))
4158 if (dump_enabled_p ())
4160 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4161 "not vectorized: no vectype for stmt: %G",
4162 stmt_info->stmt);
4163 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4164 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4165 scalar_type);
4166 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4169 if (is_a <bb_vec_info> (vinfo))
4171 /* No vector type is fine, the ref can still participate
4172 in dependence analysis, we just can't vectorize it. */
4173 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4174 continue;
4176 return opt_result::failure_at (stmt_info->stmt,
4177 "not vectorized:"
4178 " no vectype for stmt: %G"
4179 " scalar_type: %T\n",
4180 stmt_info->stmt, scalar_type);
4182 else
4184 if (dump_enabled_p ())
4185 dump_printf_loc (MSG_NOTE, vect_location,
4186 "got vectype for stmt: %G%T\n",
4187 stmt_info->stmt, STMT_VINFO_VECTYPE (stmt_info));
4190 /* Adjust the minimal vectorization factor according to the
4191 vector type. */
4192 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4193 *min_vf = upper_bound (*min_vf, vf);
4195 if (gatherscatter != SG_NONE)
4197 gather_scatter_info gs_info;
4198 if (!vect_check_gather_scatter (stmt_info,
4199 as_a <loop_vec_info> (vinfo),
4200 &gs_info)
4201 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4202 return opt_result::failure_at
4203 (stmt_info->stmt,
4204 (gatherscatter == GATHER) ?
4205 "not vectorized: not suitable for gather load %G" :
4206 "not vectorized: not suitable for scatter store %G",
4207 stmt_info->stmt);
4208 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4212 /* We used to stop processing and prune the list here. Verify we no
4213 longer need to. */
4214 gcc_assert (i == datarefs.length ());
4216 return opt_result::success ();
4220 /* Function vect_get_new_vect_var.
4222 Returns a name for a new variable. The current naming scheme prepends a
4223 prefix such as "vect_" or "vectp_" (depending on the value of VAR_KIND)
4224 to NAME when a name is provided, and uses just the prefix for the new
4225 variable otherwise. */
4227 tree
4228 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4230 const char *prefix;
4231 tree new_vect_var;
4233 switch (var_kind)
4235 case vect_simple_var:
4236 prefix = "vect";
4237 break;
4238 case vect_scalar_var:
4239 prefix = "stmp";
4240 break;
4241 case vect_mask_var:
4242 prefix = "mask";
4243 break;
4244 case vect_pointer_var:
4245 prefix = "vectp";
4246 break;
4247 default:
4248 gcc_unreachable ();
4251 if (name)
4253 char* tmp = concat (prefix, "_", name, NULL);
4254 new_vect_var = create_tmp_reg (type, tmp);
4255 free (tmp);
4257 else
4258 new_vect_var = create_tmp_reg (type, prefix);
4260 return new_vect_var;
4263 /* Like vect_get_new_vect_var but return an SSA name. */
4265 tree
4266 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4268 const char *prefix;
4269 tree new_vect_var;
4271 switch (var_kind)
4273 case vect_simple_var:
4274 prefix = "vect";
4275 break;
4276 case vect_scalar_var:
4277 prefix = "stmp";
4278 break;
4279 case vect_pointer_var:
4280 prefix = "vectp";
4281 break;
4282 default:
4283 gcc_unreachable ();
4286 if (name)
4288 char* tmp = concat (prefix, "_", name, NULL);
4289 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4290 free (tmp);
4292 else
4293 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4295 return new_vect_var;
4298 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4300 static void
4301 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4303 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4304 int misalign = DR_MISALIGNMENT (dr_info);
4305 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4306 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4307 else
4308 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4309 DR_TARGET_ALIGNMENT (dr_info), misalign);
4312 /* Function vect_create_addr_base_for_vector_ref.
4314 Create an expression that computes the address of the first memory location
4315 that will be accessed for a data reference.
4317 Input:
4318 STMT_INFO: The statement containing the data reference.
4319 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4320 OFFSET: Optional. If supplied, it is added to the initial address.
4321 LOOP: Specify the loop-nest relative to which the address should be computed.
4322 For example, when the dataref is in an inner-loop nested in an
4323 outer-loop that is now being vectorized, LOOP can be either the
4324 outer-loop, or the inner-loop. The first memory location accessed
4325 by the following dataref ('in' points to short):
4327 for (i=0; i<N; i++)
4328 for (j=0; j<M; j++)
4329 s += in[i+j]
4331 is as follows:
4332 if LOOP=i_loop: &in (relative to i_loop)
4333 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4334 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4335 initial address. Unlike OFFSET, which is a number of elements to
4336 be added, BYTE_OFFSET is measured in bytes.
4338 Output:
4339 1. Return an SSA_NAME whose value is the address of the memory location of
4340 the first vector of the data reference.
4341 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4342 these statement(s) which define the returned SSA_NAME.
4344 FORNOW: We are only handling array accesses with step 1. */
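/* Typical use (illustrative sketch, not from the original source):

     gimple_seq stmts = NULL;
     tree addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
                                                       NULL_TREE, NULL_TREE);
     if (stmts)
       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);

   which mirrors how vect_create_data_ref_ptr below consumes the result.  */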
4346 tree
4347 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4348 gimple_seq *new_stmt_list,
4349 tree offset,
4350 tree byte_offset)
4352 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4353 struct data_reference *dr = dr_info->dr;
4354 const char *base_name;
4355 tree addr_base;
4356 tree dest;
4357 gimple_seq seq = NULL;
4358 tree vect_ptr_type;
4359 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4360 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4361 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4363 tree data_ref_base = unshare_expr (drb->base_address);
4364 tree base_offset = unshare_expr (drb->offset);
4365 tree init = unshare_expr (drb->init);
4367 if (loop_vinfo)
4368 base_name = get_name (data_ref_base);
4369 else
4371 base_offset = ssize_int (0);
4372 init = ssize_int (0);
4373 base_name = get_name (DR_REF (dr));
4376 /* Create base_offset */
4377 base_offset = size_binop (PLUS_EXPR,
4378 fold_convert (sizetype, base_offset),
4379 fold_convert (sizetype, init));
4381 if (offset)
4383 offset = fold_build2 (MULT_EXPR, sizetype,
4384 fold_convert (sizetype, offset), step);
4385 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4386 base_offset, offset);
4388 if (byte_offset)
4390 byte_offset = fold_convert (sizetype, byte_offset);
4391 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4392 base_offset, byte_offset);
4395 /* base + base_offset */
4396 if (loop_vinfo)
4397 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4398 else
4400 addr_base = build1 (ADDR_EXPR,
4401 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4402 unshare_expr (DR_REF (dr)));
4405 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4406 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4407 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4408 gimple_seq_add_seq (new_stmt_list, seq);
4410 if (DR_PTR_INFO (dr)
4411 && TREE_CODE (addr_base) == SSA_NAME
4412 && !SSA_NAME_PTR_INFO (addr_base))
4414 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4415 if (offset || byte_offset)
4416 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4419 if (dump_enabled_p ())
4420 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4422 return addr_base;
4426 /* Function vect_create_data_ref_ptr.
4428 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4429 location accessed in the loop by STMT_INFO, along with the def-use update
4430 chain to appropriately advance the pointer through the loop iterations.
4431 Also set aliasing information for the pointer. This pointer is used by
4432 the callers to this function to create a memory reference expression for
4433 vector load/store access.
4435 Input:
4436 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4437 GIMPLE_ASSIGN <name, data-ref> or
4438 GIMPLE_ASSIGN <data-ref, name>.
4439 2. AGGR_TYPE: the type of the reference, which should be either a vector
4440 or an array.
4441 3. AT_LOOP: the loop where the vector memref is to be created.
4442 4. OFFSET (optional): an offset to be added to the initial address accessed
4443 by the data-ref in STMT_INFO.
4444 5. BSI: location where the new stmts are to be placed if there is no loop
4445 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4446 pointing to the initial address.
4447 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4448 to the initial address accessed by the data-ref in STMT_INFO. This is
4449 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4450 in bytes.
4451 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4452 to the IV during each iteration of the loop. NULL says to move
4453 by one copy of AGGR_TYPE up or down, depending on the step of the
4454 data reference.
4456 Output:
4457 1. Declare a new ptr to vector_type, and have it point to the base of the
4458 data reference (initial address accessed by the data reference).
4459 For example, for vector of type V8HI, the following code is generated:
4461 v8hi *ap;
4462 ap = (v8hi *)initial_address;
4464 if OFFSET is not supplied:
4465 initial_address = &a[init];
4466 if OFFSET is supplied:
4467 initial_address = &a[init + OFFSET];
4468 if BYTE_OFFSET is supplied:
4469 initial_address = &a[init] + BYTE_OFFSET;
4471 Return the initial_address in INITIAL_ADDRESS.
4473 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4474 update the pointer in each iteration of the loop.
4476 Return the increment stmt that updates the pointer in PTR_INCR.
4478 3. Return the pointer. */
4480 tree
4481 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4482 struct loop *at_loop, tree offset,
4483 tree *initial_address, gimple_stmt_iterator *gsi,
4484 gimple **ptr_incr, bool only_init,
4485 tree byte_offset, tree iv_step)
4487 const char *base_name;
4488 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4489 struct loop *loop = NULL;
4490 bool nested_in_vect_loop = false;
4491 struct loop *containing_loop = NULL;
4492 tree aggr_ptr_type;
4493 tree aggr_ptr;
4494 tree new_temp;
4495 gimple_seq new_stmt_list = NULL;
4496 edge pe = NULL;
4497 basic_block new_bb;
4498 tree aggr_ptr_init;
4499 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4500 struct data_reference *dr = dr_info->dr;
4501 tree aptr;
4502 gimple_stmt_iterator incr_gsi;
4503 bool insert_after;
4504 tree indx_before_incr, indx_after_incr;
4505 gimple *incr;
4506 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4508 gcc_assert (iv_step != NULL_TREE
4509 || TREE_CODE (aggr_type) == ARRAY_TYPE
4510 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4512 if (loop_vinfo)
4514 loop = LOOP_VINFO_LOOP (loop_vinfo);
4515 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4516 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4517 pe = loop_preheader_edge (loop);
4519 else
4521 gcc_assert (bb_vinfo);
4522 only_init = true;
4523 *ptr_incr = NULL;
4526 /* Create an expression for the first address accessed by this load
4527 in LOOP. */
4528 base_name = get_name (DR_BASE_ADDRESS (dr));
4530 if (dump_enabled_p ())
4532 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4533 dump_printf_loc (MSG_NOTE, vect_location,
4534 "create %s-pointer variable to type: %T",
4535 get_tree_code_name (TREE_CODE (aggr_type)),
4536 aggr_type);
4537 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4538 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4539 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4540 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4541 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4542 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4543 else
4544 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4545 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4548 /* (1) Create the new aggregate-pointer variable.
4549 Vector and array types inherit the alias set of their component
4550 type by default so we need to use a ref-all pointer if the data
4551 reference does not conflict with the created aggregated data
4552 reference because it is not addressable. */
4553 bool need_ref_all = false;
4554 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4555 get_alias_set (DR_REF (dr))))
4556 need_ref_all = true;
4557 /* Likewise for any of the data references in the stmt group. */
4558 else if (DR_GROUP_SIZE (stmt_info) > 1)
4560 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4563 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4564 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4565 get_alias_set (DR_REF (sdr))))
4567 need_ref_all = true;
4568 break;
4570 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4572 while (sinfo);
4574 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4575 need_ref_all);
4576 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4579 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4580 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4581 def-use update cycles for the pointer: one relative to the outer-loop
4582 (LOOP), which is what steps (3) and (4) below do. The other is relative
4583 to the inner-loop (which is the inner-most loop containing the dataref),
4584 and this is done by step (5) below.
4586 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4587 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4588 redundant. Steps (3),(4) create the following:
4590 vp0 = &base_addr;
4591 LOOP: vp1 = phi(vp0,vp2)
4594 vp2 = vp1 + step
4595 goto LOOP
4597 If there is an inner-loop nested in loop, then step (5) will also be
4598 applied, and an additional update in the inner-loop will be created:
4600 vp0 = &base_addr;
4601 LOOP: vp1 = phi(vp0,vp2)
4603 inner: vp3 = phi(vp1,vp4)
4604 vp4 = vp3 + inner_step
4605 if () goto inner
4607 vp2 = vp1 + step
4608 if () goto LOOP */
4610 /* (2) Calculate the initial address of the aggregate-pointer, and set
4611 the aggregate-pointer to point to it before the loop. */
4613 /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader. */
4615 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4616 offset, byte_offset);
4617 if (new_stmt_list)
4619 if (pe)
4621 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4622 gcc_assert (!new_bb);
4624 else
4625 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4628 *initial_address = new_temp;
4629 aggr_ptr_init = new_temp;
4631 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4632 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4633 inner-loop nested in LOOP (during outer-loop vectorization). */
4635 /* No update in loop is required. */
4636 if (only_init && (!loop_vinfo || at_loop == loop))
4637 aptr = aggr_ptr_init;
4638 else
4640 /* Accesses to invariant addresses should be handled specially
4641 by the caller. */
4642 tree step = vect_dr_behavior (dr_info)->step;
4643 gcc_assert (!integer_zerop (step));
4645 if (iv_step == NULL_TREE)
4647 /* The step of the aggregate pointer is the type size,
4648 negated for downward accesses. */
4649 iv_step = TYPE_SIZE_UNIT (aggr_type);
4650 if (tree_int_cst_sgn (step) == -1)
4651 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4654 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4656 create_iv (aggr_ptr_init,
4657 fold_convert (aggr_ptr_type, iv_step),
4658 aggr_ptr, loop, &incr_gsi, insert_after,
4659 &indx_before_incr, &indx_after_incr);
4660 incr = gsi_stmt (incr_gsi);
4661 loop_vinfo->add_stmt (incr);
4663 /* Copy the points-to information if it exists. */
4664 if (DR_PTR_INFO (dr))
4666 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4667 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4669 if (ptr_incr)
4670 *ptr_incr = incr;
4672 aptr = indx_before_incr;
4675 if (!nested_in_vect_loop || only_init)
4676 return aptr;
4679 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4680 nested in LOOP, if it exists. */
4682 gcc_assert (nested_in_vect_loop);
4683 if (!only_init)
4685 standard_iv_increment_position (containing_loop, &incr_gsi,
4686 &insert_after);
4687 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4688 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4689 &indx_after_incr);
4690 incr = gsi_stmt (incr_gsi);
4691 loop_vinfo->add_stmt (incr);
4693 /* Copy the points-to information if it exists. */
4694 if (DR_PTR_INFO (dr))
4696 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4697 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4699 if (ptr_incr)
4700 *ptr_incr = incr;
4702 return indx_before_incr;
4704 else
4705 gcc_unreachable ();
4709 /* Function bump_vector_ptr
4711 Increment a pointer (to a vector type) by vector-size. If requested,
4712 i.e. if PTR-INCR is given, then also connect the new increment stmt
4713 to the existing def-use update-chain of the pointer, by modifying
4714 the PTR_INCR as illustrated below:
4716 The pointer def-use update-chain before this function:
4717 DATAREF_PTR = phi (p_0, p_2)
4718 ....
4719 PTR_INCR: p_2 = DATAREF_PTR + step
4721 The pointer def-use update-chain after this function:
4722 DATAREF_PTR = phi (p_0, p_2)
4723 ....
4724 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4725 ....
4726 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4728 Input:
4729 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4730 in the loop.
4731 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4732 the loop. The increment amount across iterations is expected
4733 to be vector_size.
4734 BSI - location where the new update stmt is to be placed.
4735 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4736 BUMP - optional. The offset by which to bump the pointer. If not given,
4737 the offset is assumed to be vector_size.
4739 Output: Return NEW_DATAREF_PTR as illustrated above.
4743 tree
4744 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4745 stmt_vec_info stmt_info, tree bump)
4747 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4748 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4749 tree update = TYPE_SIZE_UNIT (vectype);
4750 gassign *incr_stmt;
4751 ssa_op_iter iter;
4752 use_operand_p use_p;
4753 tree new_dataref_ptr;
4755 if (bump)
4756 update = bump;
4758 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4759 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4760 else
4761 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4762 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4763 dataref_ptr, update);
4764 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4766 /* Copy the points-to information if it exists. */
4767 if (DR_PTR_INFO (dr))
4769 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4770 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4773 if (!ptr_incr)
4774 return new_dataref_ptr;
4776 /* Update the vector-pointer's cross-iteration increment. */
4777 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4779 tree use = USE_FROM_PTR (use_p);
4781 if (use == dataref_ptr)
4782 SET_USE (use_p, new_dataref_ptr);
4783 else
4784 gcc_assert (operand_equal_p (use, update, 0));
4787 return new_dataref_ptr;
4791 /* Copy memory reference info such as base/clique from the SRC reference
4792 to the DEST MEM_REF. */
4794 void
4795 vect_copy_ref_info (tree dest, tree src)
4797 if (TREE_CODE (dest) != MEM_REF)
4798 return;
4800 tree src_base = src;
4801 while (handled_component_p (src_base))
4802 src_base = TREE_OPERAND (src_base, 0);
4803 if (TREE_CODE (src_base) != MEM_REF
4804 && TREE_CODE (src_base) != TARGET_MEM_REF)
4805 return;
4807 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
4808 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
4812 /* Function vect_create_destination_var.
4814 Create a new temporary of type VECTYPE. */
4816 tree
4817 vect_create_destination_var (tree scalar_dest, tree vectype)
4819 tree vec_dest;
4820 const char *name;
4821 char *new_name;
4822 tree type;
4823 enum vect_var_kind kind;
4825 kind = vectype
4826 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4827 ? vect_mask_var
4828 : vect_simple_var
4829 : vect_scalar_var;
4830 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4832 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4834 name = get_name (scalar_dest);
4835 if (name)
4836 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4837 else
4838 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4839 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4840 free (new_name);
4842 return vec_dest;
4845 /* Function vect_grouped_store_supported.
4847 Returns TRUE if interleave high and interleave low permutations
4848 are supported, and FALSE otherwise. */
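/* For instance (illustrative only): with 4-element vectors, the two
   permutations whose support is queried below are

     interleave_high: { 0, 4, 1, 5 }
     interleave_low:  { 2, 6, 3, 7 }

   i.e. the encoding built from sel[i * 2] = i, sel[i * 2 + 1] = i + nelt
   and its variant shifted by nelt / 2.  */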
4850 bool
4851 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4853 machine_mode mode = TYPE_MODE (vectype);
4855 /* vect_permute_store_chain requires the group size to be equal to 3 or
4856 be a power of two. */
4857 if (count != 3 && exact_log2 (count) == -1)
4859 if (dump_enabled_p ())
4860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4861 "the size of the group of accesses"
4862 " is not a power of 2 or not equal to 3\n");
4863 return false;
4866 /* Check that the permutation is supported. */
4867 if (VECTOR_MODE_P (mode))
4869 unsigned int i;
4870 if (count == 3)
4872 unsigned int j0 = 0, j1 = 0, j2 = 0;
4873 unsigned int i, j;
4875 unsigned int nelt;
4876 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4878 if (dump_enabled_p ())
4879 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4880 "cannot handle groups of 3 stores for"
4881 " variable-length vectors\n");
4882 return false;
4885 vec_perm_builder sel (nelt, nelt, 1);
4886 sel.quick_grow (nelt);
4887 vec_perm_indices indices;
4888 for (j = 0; j < 3; j++)
4890 int nelt0 = ((3 - j) * nelt) % 3;
4891 int nelt1 = ((3 - j) * nelt + 1) % 3;
4892 int nelt2 = ((3 - j) * nelt + 2) % 3;
4893 for (i = 0; i < nelt; i++)
4895 if (3 * i + nelt0 < nelt)
4896 sel[3 * i + nelt0] = j0++;
4897 if (3 * i + nelt1 < nelt)
4898 sel[3 * i + nelt1] = nelt + j1++;
4899 if (3 * i + nelt2 < nelt)
4900 sel[3 * i + nelt2] = 0;
4902 indices.new_vector (sel, 2, nelt);
4903 if (!can_vec_perm_const_p (mode, indices))
4905 if (dump_enabled_p ())
4906 dump_printf (MSG_MISSED_OPTIMIZATION,
4907 "permutation op not supported by target.\n");
4908 return false;
4911 for (i = 0; i < nelt; i++)
4913 if (3 * i + nelt0 < nelt)
4914 sel[3 * i + nelt0] = 3 * i + nelt0;
4915 if (3 * i + nelt1 < nelt)
4916 sel[3 * i + nelt1] = 3 * i + nelt1;
4917 if (3 * i + nelt2 < nelt)
4918 sel[3 * i + nelt2] = nelt + j2++;
4920 indices.new_vector (sel, 2, nelt);
4921 if (!can_vec_perm_const_p (mode, indices))
4923 if (dump_enabled_p ())
4924 dump_printf (MSG_MISSED_OPTIMIZATION,
4925 "permutation op not supported by target.\n");
4926 return false;
4929 return true;
4931 else
4933 /* If length is not equal to 3 then only a power of 2 is supported. */
4934 gcc_assert (pow2p_hwi (count));
4935 poly_uint64 nelt = GET_MODE_NUNITS (mode);
4937 /* The encoding has 2 interleaved stepped patterns. */
4938 vec_perm_builder sel (nelt, 2, 3);
4939 sel.quick_grow (6);
4940 for (i = 0; i < 3; i++)
4942 sel[i * 2] = i;
4943 sel[i * 2 + 1] = i + nelt;
4945 vec_perm_indices indices (sel, 2, nelt);
4946 if (can_vec_perm_const_p (mode, indices))
4948 for (i = 0; i < 6; i++)
4949 sel[i] += exact_div (nelt, 2);
4950 indices.new_vector (sel, 2, nelt);
4951 if (can_vec_perm_const_p (mode, indices))
4952 return true;
4957 if (dump_enabled_p ())
4958 dump_printf (MSG_MISSED_OPTIMIZATION,
4959 "permutation op not supported by target.\n");
4960 return false;
4964 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
4965 type VECTYPE. MASKED_P says whether the masked form is needed. */
4967 bool
4968 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
4969 bool masked_p)
4971 if (masked_p)
4972 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
4973 vec_mask_store_lanes_optab,
4974 vectype, count);
4975 else
4976 return vect_lanes_optab_supported_p ("vec_store_lanes",
4977 vec_store_lanes_optab,
4978 vectype, count);
4982 /* Function vect_permute_store_chain.
4984 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4985 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4986 the data correctly for the stores. Return the final references for stores
4987 in RESULT_CHAIN.
4989 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4990 The input is 4 vectors each containing 8 elements. We assign a number to
4991 each element, the input sequence is:
4993 1st vec: 0 1 2 3 4 5 6 7
4994 2nd vec: 8 9 10 11 12 13 14 15
4995 3rd vec: 16 17 18 19 20 21 22 23
4996 4th vec: 24 25 26 27 28 29 30 31
4998 The output sequence should be:
5000 1st vec: 0 8 16 24 1 9 17 25
5001 2nd vec: 2 10 18 26 3 11 19 27
5002 3rd vec: 4 12 20 28 5 13 21 29
5003 4th vec: 6 14 22 30 7 15 23 31
5005 i.e., we interleave the contents of the four vectors in their order.
5007 We use interleave_high/low instructions to create such output. The input of
5008 each interleave_high/low operation is two vectors:
5009 1st vec 2nd vec
5010 0 1 2 3 4 5 6 7
5011 the even elements of the result vector are obtained left-to-right from the
5012 high/low elements of the first vector. The odd elements of the result are
5013 obtained left-to-right from the high/low elements of the second vector.
5014 The output of interleave_high will be: 0 4 1 5
5015 and of interleave_low: 2 6 3 7
5018 The permutation is done in log LENGTH stages. In each stage interleave_high
5019 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5020 where the first argument is taken from the first half of DR_CHAIN and the
5021 second argument from its second half.
5022 In our example,
5024 I1: interleave_high (1st vec, 3rd vec)
5025 I2: interleave_low (1st vec, 3rd vec)
5026 I3: interleave_high (2nd vec, 4th vec)
5027 I4: interleave_low (2nd vec, 4th vec)
5029 The output for the first stage is:
5031 I1: 0 16 1 17 2 18 3 19
5032 I2: 4 20 5 21 6 22 7 23
5033 I3: 8 24 9 25 10 26 11 27
5034 I4: 12 28 13 29 14 30 15 31
5036 The output of the second stage, i.e. the final result is:
5038 I1: 0 8 16 24 1 9 17 25
5039 I2: 2 10 18 26 3 11 19 27
5040 I3: 4 12 20 28 5 13 21 29
5041 I4: 6 14 22 30 7 15 23 31. */
5043 void
5044 vect_permute_store_chain (vec<tree> dr_chain,
5045 unsigned int length,
5046 stmt_vec_info stmt_info,
5047 gimple_stmt_iterator *gsi,
5048 vec<tree> *result_chain)
5050 tree vect1, vect2, high, low;
5051 gimple *perm_stmt;
5052 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5053 tree perm_mask_low, perm_mask_high;
5054 tree data_ref;
5055 tree perm3_mask_low, perm3_mask_high;
5056 unsigned int i, j, n, log_length = exact_log2 (length);
5058 result_chain->quick_grow (length);
5059 memcpy (result_chain->address (), dr_chain.address (),
5060 length * sizeof (tree));
5062 if (length == 3)
5064 /* vect_grouped_store_supported ensures that this is constant. */
5065 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5066 unsigned int j0 = 0, j1 = 0, j2 = 0;
5068 vec_perm_builder sel (nelt, nelt, 1);
5069 sel.quick_grow (nelt);
5070 vec_perm_indices indices;
5071 for (j = 0; j < 3; j++)
5073 int nelt0 = ((3 - j) * nelt) % 3;
5074 int nelt1 = ((3 - j) * nelt + 1) % 3;
5075 int nelt2 = ((3 - j) * nelt + 2) % 3;
5077 for (i = 0; i < nelt; i++)
5079 if (3 * i + nelt0 < nelt)
5080 sel[3 * i + nelt0] = j0++;
5081 if (3 * i + nelt1 < nelt)
5082 sel[3 * i + nelt1] = nelt + j1++;
5083 if (3 * i + nelt2 < nelt)
5084 sel[3 * i + nelt2] = 0;
5086 indices.new_vector (sel, 2, nelt);
5087 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5089 for (i = 0; i < nelt; i++)
5091 if (3 * i + nelt0 < nelt)
5092 sel[3 * i + nelt0] = 3 * i + nelt0;
5093 if (3 * i + nelt1 < nelt)
5094 sel[3 * i + nelt1] = 3 * i + nelt1;
5095 if (3 * i + nelt2 < nelt)
5096 sel[3 * i + nelt2] = nelt + j2++;
5098 indices.new_vector (sel, 2, nelt);
5099 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5101 vect1 = dr_chain[0];
5102 vect2 = dr_chain[1];
5104 /* Create interleaving stmt:
5105 low = VEC_PERM_EXPR <vect1, vect2,
5106 {j, nelt, *, j + 1, nelt + j + 1, *,
5107 j + 2, nelt + j + 2, *, ...}> */
5108 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5109 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5110 vect2, perm3_mask_low);
5111 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5113 vect1 = data_ref;
5114 vect2 = dr_chain[2];
5115 /* Create interleaving stmt:
5116 low = VEC_PERM_EXPR <vect1, vect2,
5117 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5118 6, 7, nelt + j + 2, ...}> */
5119 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5120 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5121 vect2, perm3_mask_high);
5122 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5123 (*result_chain)[j] = data_ref;
5126 else
5128 /* If length is not equal to 3 then only a power of 2 is supported. */
5129 gcc_assert (pow2p_hwi (length));
5131 /* The encoding has 2 interleaved stepped patterns. */
5132 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5133 vec_perm_builder sel (nelt, 2, 3);
5134 sel.quick_grow (6);
5135 for (i = 0; i < 3; i++)
5137 sel[i * 2] = i;
5138 sel[i * 2 + 1] = i + nelt;
5140 vec_perm_indices indices (sel, 2, nelt);
5141 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5143 for (i = 0; i < 6; i++)
5144 sel[i] += exact_div (nelt, 2);
5145 indices.new_vector (sel, 2, nelt);
5146 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5148 for (i = 0, n = log_length; i < n; i++)
5150 for (j = 0; j < length/2; j++)
5152 vect1 = dr_chain[j];
5153 vect2 = dr_chain[j+length/2];
5155 /* Create interleaving stmt:
5156 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5157 ...}> */
5158 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5159 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5160 vect2, perm_mask_high);
5161 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5162 (*result_chain)[2*j] = high;
5164 /* Create interleaving stmt:
5165 low = VEC_PERM_EXPR <vect1, vect2,
5166 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5167 ...}> */
5168 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5169 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5170 vect2, perm_mask_low);
5171 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5172 (*result_chain)[2*j+1] = low;
5174 memcpy (dr_chain.address (), result_chain->address (),
5175 length * sizeof (tree));
5180 /* Function vect_setup_realignment
5182 This function is called when vectorizing an unaligned load using
5183 the dr_explicit_realign[_optimized] scheme.
5184 This function generates the following code at the loop prolog:
5186 p = initial_addr;
5187 x msq_init = *(floor(p)); # prolog load
5188 realignment_token = call target_builtin;
5189 loop:
5190 x msq = phi (msq_init, ---)
5192 The stmts marked with x are generated only for the case of
5193 dr_explicit_realign_optimized.
5195 The code above sets up a new (vector) pointer, pointing to the first
5196 location accessed by STMT_INFO, and a "floor-aligned" load using that
5197 pointer. It also generates code to compute the "realignment-token"
5198 (if the relevant target hook was defined), and creates a phi-node at the
5199 loop-header bb whose arguments are the result of the prolog-load (created
5200 by this function) and the result of a load that takes place in the loop
5201 (to be created by the caller to this function).
5203 For the case of dr_explicit_realign_optimized:
5204 The caller to this function uses the phi-result (msq) to create the
5205 realignment code inside the loop, and sets up the missing phi argument,
5206 as follows:
5207 loop:
5208 msq = phi (msq_init, lsq)
5209 lsq = *(floor(p')); # load in loop
5210 result = realign_load (msq, lsq, realignment_token);
5212 For the case of dr_explicit_realign:
5213 loop:
5214 msq = *(floor(p)); # load in loop
5215 p' = p + (VS-1);
5216 lsq = *(floor(p')); # load in loop
5217 result = realign_load (msq, lsq, realignment_token);
5219 Input:
5220 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5221 a memory location that may be unaligned.
5222 BSI - place where new code is to be inserted.
5223 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5224 is used.
5226 Output:
5227 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5228 target hook, if defined.
5229 Return value - the result of the loop-header phi node. */
5231 tree
5232 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5233 tree *realignment_token,
5234 enum dr_alignment_support alignment_support_scheme,
5235 tree init_addr,
5236 struct loop **at_loop)
5238 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5239 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5240 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5241 struct data_reference *dr = dr_info->dr;
5242 struct loop *loop = NULL;
5243 edge pe = NULL;
5244 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5245 tree vec_dest;
5246 gimple *inc;
5247 tree ptr;
5248 tree data_ref;
5249 basic_block new_bb;
5250 tree msq_init = NULL_TREE;
5251 tree new_temp;
5252 gphi *phi_stmt;
5253 tree msq = NULL_TREE;
5254 gimple_seq stmts = NULL;
5255 bool compute_in_loop = false;
5256 bool nested_in_vect_loop = false;
5257 struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5258 struct loop *loop_for_initial_load = NULL;
5260 if (loop_vinfo)
5262 loop = LOOP_VINFO_LOOP (loop_vinfo);
5263 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5266 gcc_assert (alignment_support_scheme == dr_explicit_realign
5267 || alignment_support_scheme == dr_explicit_realign_optimized);
5269 /* We need to generate three things:
5270 1. the misalignment computation
5271 2. the extra vector load (for the optimized realignment scheme).
5272 3. the phi node for the two vectors from which the realignment is
5273 done (for the optimized realignment scheme). */
5275 /* 1. Determine where to generate the misalignment computation.
5277 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5278 calculation will be generated by this function, outside the loop (in the
5279 preheader). Otherwise, INIT_ADDR has already been computed for us by the
5280 caller, inside the loop.
5282 Background: If the misalignment remains fixed throughout the iterations of
5283 the loop, then both realignment schemes are applicable, and also the
5284 misalignment computation can be done outside LOOP. This is because we are
5285 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5286 are a multiple of VS (the Vector Size), and therefore the misalignment in
5287 different vectorized LOOP iterations is always the same.
5288 The problem arises only if the memory access is in an inner-loop nested
5289 inside LOOP, which is now being vectorized using outer-loop vectorization.
5290 This is the only case when the misalignment of the memory access may not
5291 remain fixed throughout the iterations of the inner-loop (as explained in
5292 detail in vect_supportable_dr_alignment). In this case, not only is the
5293 optimized realignment scheme not applicable, but also the misalignment
5294 computation (and generation of the realignment token that is passed to
5295 REALIGN_LOAD) have to be done inside the loop.
5297 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5298 or not, which in turn determines if the misalignment is computed inside
5299 the inner-loop, or outside LOOP. */
5301 if (init_addr != NULL_TREE || !loop_vinfo)
5303 compute_in_loop = true;
5304 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5308 /* 2. Determine where to generate the extra vector load.
5310 For the optimized realignment scheme, instead of generating two vector
5311 loads in each iteration, we generate a single extra vector load in the
5312 preheader of the loop, and in each iteration reuse the result of the
5313 vector load from the previous iteration. In case the memory access is in
5314 an inner-loop nested inside LOOP, which is now being vectorized using
5315 outer-loop vectorization, we need to determine whether this initial vector
5316 load should be generated at the preheader of the inner-loop, or can be
5317 generated at the preheader of LOOP. If the memory access has no evolution
5318 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5319 to be generated inside LOOP (in the preheader of the inner-loop). */
5321 if (nested_in_vect_loop)
5323 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5324 bool invariant_in_outerloop =
5325 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5326 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5328 else
5329 loop_for_initial_load = loop;
5330 if (at_loop)
5331 *at_loop = loop_for_initial_load;
5333 if (loop_for_initial_load)
5334 pe = loop_preheader_edge (loop_for_initial_load);
5336 /* 3. For the case of the optimized realignment, create the first vector
5337 load at the loop preheader. */
5339 if (alignment_support_scheme == dr_explicit_realign_optimized)
5341 /* Create msq_init = *(floor(p1)) in the loop preheader */
5342 gassign *new_stmt;
5344 gcc_assert (!compute_in_loop);
5345 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5346 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5347 loop_for_initial_load, NULL_TREE,
5348 &init_addr, NULL, &inc, true);
5349 if (TREE_CODE (ptr) == SSA_NAME)
5350 new_temp = copy_ssa_name (ptr);
5351 else
5352 new_temp = make_ssa_name (TREE_TYPE (ptr));
5353 unsigned int align = DR_TARGET_ALIGNMENT (dr_info);
5354 new_stmt = gimple_build_assign
5355 (new_temp, BIT_AND_EXPR, ptr,
5356 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5357 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5358 gcc_assert (!new_bb);
5359 data_ref
5360 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5361 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5362 vect_copy_ref_info (data_ref, DR_REF (dr));
5363 new_stmt = gimple_build_assign (vec_dest, data_ref);
5364 new_temp = make_ssa_name (vec_dest, new_stmt);
5365 gimple_assign_set_lhs (new_stmt, new_temp);
5366 if (pe)
5368 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5369 gcc_assert (!new_bb);
5371 else
5372 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5374 msq_init = gimple_assign_lhs (new_stmt);
5377 /* 4. Create realignment token using a target builtin, if available.
5378 It is done either inside the containing loop, or before LOOP (as
5379 determined above). */
5381 if (targetm.vectorize.builtin_mask_for_load)
5383 gcall *new_stmt;
5384 tree builtin_decl;
5386 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5387 if (!init_addr)
5389 /* Generate the INIT_ADDR computation outside LOOP. */
5390 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5391 NULL_TREE);
5392 if (loop)
5394 pe = loop_preheader_edge (loop);
5395 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5396 gcc_assert (!new_bb);
5398 else
5399 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5402 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5403 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5404 vec_dest =
5405 vect_create_destination_var (scalar_dest,
5406 gimple_call_return_type (new_stmt));
5407 new_temp = make_ssa_name (vec_dest, new_stmt);
5408 gimple_call_set_lhs (new_stmt, new_temp);
5410 if (compute_in_loop)
5411 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5412 else
5414 /* Generate the misalignment computation outside LOOP. */
5415 pe = loop_preheader_edge (loop);
5416 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5417 gcc_assert (!new_bb);
5420 *realignment_token = gimple_call_lhs (new_stmt);
5422 /* The result of the CALL_EXPR to this builtin is determined from
5423 the value of the parameter and no global variables are touched,
5424 which makes the builtin a "const" function. Requiring the
5425 builtin to have the "const" attribute makes it unnecessary
5426 to call mark_call_clobbered. */
5427 gcc_assert (TREE_READONLY (builtin_decl));
5430 if (alignment_support_scheme == dr_explicit_realign)
5431 return msq;
5433 gcc_assert (!compute_in_loop);
5434 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5437 /* 5. Create msq = phi <msq_init, lsq> in loop */
5439 pe = loop_preheader_edge (containing_loop);
5440 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5441 msq = make_ssa_name (vec_dest);
5442 phi_stmt = create_phi_node (msq, containing_loop->header);
5443 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5445 return msq;
5449 /* Function vect_grouped_load_supported.
5451 COUNT is the size of the load group (the number of statements plus the
5452 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5453 only one statement, with a gap of COUNT - 1.
5455 Returns true if a suitable permute exists. */
5457 bool
5458 vect_grouped_load_supported (tree vectype, bool single_element_p,
5459 unsigned HOST_WIDE_INT count)
5461 machine_mode mode = TYPE_MODE (vectype);
5463 /* If this is single-element interleaving with an element distance
5464 that leaves unused vector loads around, punt - we would at least
5465 create very sub-optimal code in that case (and blow up memory;
5466 see PR65518). */
5467 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5469 if (dump_enabled_p ())
5470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5471 "single-element interleaving not supported "
5472 "for not adjacent vector loads\n");
5473 return false;
5476 /* vect_permute_load_chain requires the group size to be equal to 3 or
5477 be a power of two. */
5478 if (count != 3 && exact_log2 (count) == -1)
5480 if (dump_enabled_p ())
5481 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5482 "the size of the group of accesses"
5483 " is not a power of 2 or not equal to 3\n");
5484 return false;
5487 /* Check that the permutation is supported. */
5488 if (VECTOR_MODE_P (mode))
5490 unsigned int i, j;
5491 if (count == 3)
5493 unsigned int nelt;
5494 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5496 if (dump_enabled_p ())
5497 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5498 "cannot handle groups of 3 loads for"
5499 " variable-length vectors\n");
5500 return false;
5503 vec_perm_builder sel (nelt, nelt, 1);
5504 sel.quick_grow (nelt);
5505 vec_perm_indices indices;
5506 unsigned int k;
5507 for (k = 0; k < 3; k++)
5509 for (i = 0; i < nelt; i++)
5510 if (3 * i + k < 2 * nelt)
5511 sel[i] = 3 * i + k;
5512 else
5513 sel[i] = 0;
5514 indices.new_vector (sel, 2, nelt);
5515 if (!can_vec_perm_const_p (mode, indices))
5517 if (dump_enabled_p ())
5518 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5519 "shuffle of 3 loads is not supported by"
5520 " target\n");
5521 return false;
5523 for (i = 0, j = 0; i < nelt; i++)
5524 if (3 * i + k < 2 * nelt)
5525 sel[i] = i;
5526 else
5527 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5528 indices.new_vector (sel, 2, nelt);
5529 if (!can_vec_perm_const_p (mode, indices))
5531 if (dump_enabled_p ())
5532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5533 "shuffle of 3 loads is not supported by"
5534 " target\n");
5535 return false;
5538 return true;
5540 else
5542 /* If length is not equal to 3 then only power of 2 is supported. */
5543 gcc_assert (pow2p_hwi (count));
5544 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5546 /* The encoding has a single stepped pattern. */
5547 vec_perm_builder sel (nelt, 1, 3);
5548 sel.quick_grow (3);
5549 for (i = 0; i < 3; i++)
5550 sel[i] = i * 2;
5551 vec_perm_indices indices (sel, 2, nelt);
5552 if (can_vec_perm_const_p (mode, indices))
5554 for (i = 0; i < 3; i++)
5555 sel[i] = i * 2 + 1;
5556 indices.new_vector (sel, 2, nelt);
5557 if (can_vec_perm_const_p (mode, indices))
5558 return true;
5563 if (dump_enabled_p ())
5564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5565 "extract even/odd not supported by target\n");
5566 return false;
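/* Illustrative sketch (kept under #if 0, never compiled): the two
   selectors whose support is checked above for the COUNT == 3 case,
   expanded in plain C for a hypothetical NELT of 8 and applied to three
   vectors holding 0..23.  The vec_perm helper stands in for
   VEC_PERM_EXPR; the printed rows are {0 3 6 ...}, {1 4 7 ...} and
   {2 5 8 ...}.  */
#if 0
#include <stdio.h>

#define NELT 8	/* hypothetical number of elements per vector */

/* Emulate VEC_PERM_EXPR on the concatenation of V1 and V2.  */
static void
vec_perm (const int *v1, const int *v2, const unsigned *sel, int *out)
{
  for (unsigned i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? v1[sel[i]] : v2[sel[i] - NELT];
}

int
main (void)
{
  /* Three loaded vectors holding the elements 0..23.  */
  int chain[3][NELT];
  for (unsigned j = 0; j < 3; j++)
    for (unsigned i = 0; i < NELT; i++)
      chain[j][i] = j * NELT + i;

  for (unsigned k = 0; k < 3; k++)
    {
      unsigned sel[NELT], i, j;
      int low[NELT], out[NELT];

      /* First selector: {k, 3 + k, 6 + k, ...}, padded with 0.  */
      for (i = 0; i < NELT; i++)
	sel[i] = 3 * i + k < 2 * NELT ? 3 * i + k : 0;
      vec_perm (chain[0], chain[1], sel, low);

      /* Second selector: keep the gathered prefix, take the remaining
	 elements from the third vector.  */
      for (i = 0, j = 0; i < NELT; i++)
	sel[i] = (3 * i + k < 2 * NELT
		  ? i : NELT + ((NELT + k) % 3) + 3 * (j++));
      vec_perm (low, chain[2], sel, out);

      for (i = 0; i < NELT; i++)
	printf ("%2d ", out[i]);
      printf ("\n");
    }
  return 0;
}
#endif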
5569 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5570 type VECTYPE. MASKED_P says whether the masked form is needed. */
5572 bool
5573 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5574 bool masked_p)
5576 if (masked_p)
5577 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5578 vec_mask_load_lanes_optab,
5579 vectype, count);
5580 else
5581 return vect_lanes_optab_supported_p ("vec_load_lanes",
5582 vec_load_lanes_optab,
5583 vectype, count);
5586 /* Function vect_permute_load_chain.
5588 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5589 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5590 the input data correctly. Return the final references for loads in
5591 RESULT_CHAIN.
5593 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5594 The input is 4 vectors each containing 8 elements. We assign a number to
5595 each element; the input sequence is:
5597 1st vec: 0 1 2 3 4 5 6 7
5598 2nd vec: 8 9 10 11 12 13 14 15
5599 3rd vec: 16 17 18 19 20 21 22 23
5600 4th vec: 24 25 26 27 28 29 30 31
5602 The output sequence should be:
5604 1st vec: 0 4 8 12 16 20 24 28
5605 2nd vec: 1 5 9 13 17 21 25 29
5606 3rd vec: 2 6 10 14 18 22 26 30
5607 4th vec: 3 7 11 15 19 23 27 31
5609 i.e., the first output vector should contain the first elements of each
5610 interleaving group, etc.
5612 We use extract_even/odd instructions to create such output. The input of
5613 each extract_even/odd operation is two vectors
5614 1st vec 2nd vec
5615 0 1 2 3 4 5 6 7
5617 and the output is the vector of extracted even/odd elements. The output of
5618 extract_even will be: 0 2 4 6
5619 and of extract_odd: 1 3 5 7
5622 The permutation is done in log LENGTH stages. In each stage extract_even
5623 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5624 their order. In our example,
5626 E1: extract_even (1st vec, 2nd vec)
5627 E2: extract_odd (1st vec, 2nd vec)
5628 E3: extract_even (3rd vec, 4th vec)
5629 E4: extract_odd (3rd vec, 4th vec)
5631 The output for the first stage will be:
5633 E1: 0 2 4 6 8 10 12 14
5634 E2: 1 3 5 7 9 11 13 15
5635 E3: 16 18 20 22 24 26 28 30
5636 E4: 17 19 21 23 25 27 29 31
5638 In order to proceed and create the correct sequence for the next stage (or
5639 for the correct output, if the second stage is the last one, as in our
5640 example), we first put the output of extract_even operation and then the
5641 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5642 The input for the second stage is:
5644 1st vec (E1): 0 2 4 6 8 10 12 14
5645 2nd vec (E3): 16 18 20 22 24 26 28 30
5646 3rd vec (E2): 1 3 5 7 9 11 13 15
5647 4th vec (E4): 17 19 21 23 25 27 29 31
5649 The output of the second stage:
5651 E1: 0 4 8 12 16 20 24 28
5652 E2: 2 6 10 14 18 22 26 30
5653 E3: 1 5 9 13 17 21 25 29
5654 E4: 3 7 11 15 19 23 27 31
5656 And RESULT_CHAIN after reordering:
5658 1st vec (E1): 0 4 8 12 16 20 24 28
5659 2nd vec (E3): 1 5 9 13 17 21 25 29
5660 3rd vec (E2): 2 6 10 14 18 22 26 30
5661 4th vec (E4): 3 7 11 15 19 23 27 31. */
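/* Illustrative sketch (kept under #if 0, never compiled): a plain-C run
   of the extract_even/extract_odd stages described above for the 4 x 8
   worked example, assuming hypothetical NELT == 8 and LEN == 4.  The
   rows printed at the end are exactly the reordered RESULT_CHAIN shown
   in the comment.  */
#if 0
#include <stdio.h>
#include <string.h>

#define NELT 8	/* elements per vector (the shorts of the example) */
#define LEN  4	/* number of vectors in the load chain             */

/* Emulate VEC_PERM_EXPR on the concatenation of V1 and V2.  */
static void
vec_perm (const int *v1, const int *v2, const unsigned *sel, int *out)
{
  for (unsigned i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? v1[sel[i]] : v2[sel[i] - NELT];
}

int
main (void)
{
  unsigned sel_even[NELT], sel_odd[NELT];
  for (unsigned i = 0; i < NELT; i++)
    {
      sel_even[i] = i * 2;	/* {0, 2, ..., 2*NELT-2} */
      sel_odd[i] = i * 2 + 1;	/* {1, 3, ..., 2*NELT-1} */
    }

  /* DR_CHAIN as in the comment above: elements numbered 0..31.  */
  int chain[LEN][NELT], result[LEN][NELT];
  for (unsigned j = 0; j < LEN; j++)
    for (unsigned i = 0; i < NELT; i++)
      chain[j][i] = j * NELT + i;

  /* log2 (LEN) == 2 stages of even/odd extraction.  */
  for (unsigned stage = 0; stage < 2; stage++)
    {
      for (unsigned j = 0; j < LEN; j += 2)
	{
	  vec_perm (chain[j], chain[j + 1], sel_even, result[j / 2]);
	  vec_perm (chain[j], chain[j + 1], sel_odd,
		    result[j / 2 + LEN / 2]);
	}
      memcpy (chain, result, sizeof chain);
    }

  /* Prints 0 4 8 ... 28 / 1 5 9 ... 29 / 2 6 ... 30 / 3 7 ... 31.  */
  for (unsigned j = 0; j < LEN; j++)
    {
      for (unsigned i = 0; i < NELT; i++)
	printf ("%2d ", chain[j][i]);
      printf ("\n");
    }
  return 0;
}
#endif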
5663 static void
5664 vect_permute_load_chain (vec<tree> dr_chain,
5665 unsigned int length,
5666 stmt_vec_info stmt_info,
5667 gimple_stmt_iterator *gsi,
5668 vec<tree> *result_chain)
5670 tree data_ref, first_vect, second_vect;
5671 tree perm_mask_even, perm_mask_odd;
5672 tree perm3_mask_low, perm3_mask_high;
5673 gimple *perm_stmt;
5674 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5675 unsigned int i, j, log_length = exact_log2 (length);
5677 result_chain->quick_grow (length);
5678 memcpy (result_chain->address (), dr_chain.address (),
5679 length * sizeof (tree));
5681 if (length == 3)
5683 /* vect_grouped_load_supported ensures that this is constant. */
5684 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5685 unsigned int k;
5687 vec_perm_builder sel (nelt, nelt, 1);
5688 sel.quick_grow (nelt);
5689 vec_perm_indices indices;
5690 for (k = 0; k < 3; k++)
5692 for (i = 0; i < nelt; i++)
5693 if (3 * i + k < 2 * nelt)
5694 sel[i] = 3 * i + k;
5695 else
5696 sel[i] = 0;
5697 indices.new_vector (sel, 2, nelt);
5698 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5700 for (i = 0, j = 0; i < nelt; i++)
5701 if (3 * i + k < 2 * nelt)
5702 sel[i] = i;
5703 else
5704 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5705 indices.new_vector (sel, 2, nelt);
5706 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5708 first_vect = dr_chain[0];
5709 second_vect = dr_chain[1];
5711 /* Create interleaving stmt (low part of):
5712 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5713 ...}> */
5714 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5715 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5716 second_vect, perm3_mask_low);
5717 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5719 /* Create interleaving stmt (high part of):
5720 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5721 ...}> */
5722 first_vect = data_ref;
5723 second_vect = dr_chain[2];
5724 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5725 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5726 second_vect, perm3_mask_high);
5727 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5728 (*result_chain)[k] = data_ref;
5731 else
5733 /* If length is not equal to 3 then only power of 2 is supported. */
5734 gcc_assert (pow2p_hwi (length));
5736 /* The encoding has a single stepped pattern. */
5737 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5738 vec_perm_builder sel (nelt, 1, 3);
5739 sel.quick_grow (3);
5740 for (i = 0; i < 3; ++i)
5741 sel[i] = i * 2;
5742 vec_perm_indices indices (sel, 2, nelt);
5743 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5745 for (i = 0; i < 3; ++i)
5746 sel[i] = i * 2 + 1;
5747 indices.new_vector (sel, 2, nelt);
5748 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5750 for (i = 0; i < log_length; i++)
5752 for (j = 0; j < length; j += 2)
5754 first_vect = dr_chain[j];
5755 second_vect = dr_chain[j+1];
5757 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5758 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5759 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5760 first_vect, second_vect,
5761 perm_mask_even);
5762 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5763 (*result_chain)[j/2] = data_ref;
5765 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5766 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5767 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5768 first_vect, second_vect,
5769 perm_mask_odd);
5770 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5771 (*result_chain)[j/2+length/2] = data_ref;
5773 memcpy (dr_chain.address (), result_chain->address (),
5774 length * sizeof (tree));
5779 /* Function vect_shift_permute_load_chain.
5781 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5782 sequence of stmts to reorder the input data accordingly.
5783 Return the final references for loads in RESULT_CHAIN.
5784 Return true if it succeeded, false otherwise.
5786 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5787 The input is 3 vectors each containing 8 elements. We assign a
5788 number to each element; the input sequence is:
5790 1st vec: 0 1 2 3 4 5 6 7
5791 2nd vec: 8 9 10 11 12 13 14 15
5792 3rd vec: 16 17 18 19 20 21 22 23
5794 The output sequence should be:
5796 1st vec: 0 3 6 9 12 15 18 21
5797 2nd vec: 1 4 7 10 13 16 19 22
5798 3rd vec: 2 5 8 11 14 17 20 23
5800 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5802 First we shuffle all 3 vectors to get correct elements order:
5804 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5805 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5806 3rd vec: (16 19 22) (17 20 23) (18 21)
5808 Next we unite and shift vector 3 times:
5810 1st step:
5811 shift right by 6 the concatenation of:
5812 "1st vec" and "2nd vec"
5813 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5814 "2nd vec" and "3rd vec"
5815 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5816 "3rd vec" and "1st vec"
5817 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5818 | New vectors |
5820 So that now new vectors are:
5822 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5823 2nd vec: (10 13) (16 19 22) (17 20 23)
5824 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5826 2nd step:
5827 shift right by 5 the concatenation of:
5828 "1st vec" and "3rd vec"
5829 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5830 "2nd vec" and "1st vec"
5831 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5832 "3rd vec" and "2nd vec"
5833 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5834 | New vectors |
5836 So that now new vectors are:
5838 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5839 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5840 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5842 3rd step:
5843 shift right by 5 the concatenation of:
5844 "1st vec" and "1st vec"
5845 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5846 shift right by 3 the concatenation of:
5847 "2nd vec" and "2nd vec"
5848 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5849 | New vectors |
5851 So that now all vectors are READY:
5852 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5853 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5854 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5856 This algorithm is faster than the one in vect_permute_load_chain if:
5857 1. "shift of a concatenation" is faster than a general permutation.
5858 This is usually so.
5859 2. The TARGET machine can't execute vector instructions in parallel.
5860 This is because each step of the algorithm depends on the previous one;
5861 the algorithm in vect_permute_load_chain is much more parallel.
5863 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF. */
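/* Illustrative sketch (kept under #if 0, never compiled): the selector
   values that the LENGTH == 3 branch below constructs, recomputed in
   plain C for a hypothetical NELT of 8.  The printed rows match the
   constants quoted in the comments: {0 3 6 1 4 7 2 5}, {6..13}, {5..12},
   {3..10} and {5..12}.  */
#if 0
#include <stdio.h>

#define NELT 8	/* hypothetical number of elements per vector */

static void
print_sel (const char *name, const unsigned *sel)
{
  printf ("%s:", name);
  for (unsigned i = 0; i < NELT; i++)
    printf (" %u", sel[i]);
  printf ("\n");
}

int
main (void)
{
  unsigned sel[NELT];
  unsigned i, k = 0, l = 0;

  /* The shuffle that puts each vector's elements in the right order.  */
  for (i = 0; i < NELT; i++)
    {
      if (3 * k + (l % 3) >= NELT)
	{
	  k = 0;
	  l += 3 - (NELT % 3);
	}
      sel[i] = 3 * k + (l % 3);
      k++;
    }
  print_sel ("perm3 ", sel);

  /* The four shift selectors used by the "unite and shift" steps.  */
  for (i = 0; i < NELT; i++)
    sel[i] = 2 * (NELT / 3) + (NELT % 3) + i;
  print_sel ("shift1", sel);

  for (i = 0; i < NELT; i++)
    sel[i] = 2 * (NELT / 3) + 1 + i;
  print_sel ("shift2", sel);

  for (i = 0; i < NELT; i++)
    sel[i] = (NELT / 3) + (NELT % 3) / 2 + i;
  print_sel ("shift3", sel);

  for (i = 0; i < NELT; i++)
    sel[i] = 2 * (NELT / 3) + (NELT % 3) / 2 + i;
  print_sel ("shift4", sel);
  return 0;
}
#endif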
5866 static bool
5867 vect_shift_permute_load_chain (vec<tree> dr_chain,
5868 unsigned int length,
5869 stmt_vec_info stmt_info,
5870 gimple_stmt_iterator *gsi,
5871 vec<tree> *result_chain)
5873 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5874 tree perm2_mask1, perm2_mask2, perm3_mask;
5875 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5876 gimple *perm_stmt;
5878 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5879 unsigned int i;
5880 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5882 unsigned HOST_WIDE_INT nelt, vf;
5883 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
5884 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
5885 /* Not supported for variable-length vectors. */
5886 return false;
5888 vec_perm_builder sel (nelt, nelt, 1);
5889 sel.quick_grow (nelt);
5891 result_chain->quick_grow (length);
5892 memcpy (result_chain->address (), dr_chain.address (),
5893 length * sizeof (tree));
5895 if (pow2p_hwi (length) && vf > 4)
5897 unsigned int j, log_length = exact_log2 (length);
5898 for (i = 0; i < nelt / 2; ++i)
5899 sel[i] = i * 2;
5900 for (i = 0; i < nelt / 2; ++i)
5901 sel[nelt / 2 + i] = i * 2 + 1;
5902 vec_perm_indices indices (sel, 2, nelt);
5903 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5905 if (dump_enabled_p ())
5906 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5907 "shuffle of 2 fields structure is not \
5908 supported by target\n");
5909 return false;
5911 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
5913 for (i = 0; i < nelt / 2; ++i)
5914 sel[i] = i * 2 + 1;
5915 for (i = 0; i < nelt / 2; ++i)
5916 sel[nelt / 2 + i] = i * 2;
5917 indices.new_vector (sel, 2, nelt);
5918 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5920 if (dump_enabled_p ())
5921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5922 "shuffle of 2 fields structure is not \
5923 supported by target\n");
5924 return false;
5926 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
5928 /* Generating permutation constant to shift all elements.
5929 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5930 for (i = 0; i < nelt; i++)
5931 sel[i] = nelt / 2 + i;
5932 indices.new_vector (sel, 2, nelt);
5933 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5935 if (dump_enabled_p ())
5936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5937 "shift permutation is not supported by target\n");
5938 return false;
5940 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5942 /* Generating permutation constant to select vector from 2.
5943 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5944 for (i = 0; i < nelt / 2; i++)
5945 sel[i] = i;
5946 for (i = nelt / 2; i < nelt; i++)
5947 sel[i] = nelt + i;
5948 indices.new_vector (sel, 2, nelt);
5949 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5951 if (dump_enabled_p ())
5952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5953 "select is not supported by target\n");
5954 return false;
5956 select_mask = vect_gen_perm_mask_checked (vectype, indices);
5958 for (i = 0; i < log_length; i++)
5960 for (j = 0; j < length; j += 2)
5962 first_vect = dr_chain[j];
5963 second_vect = dr_chain[j + 1];
5965 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5966 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5967 first_vect, first_vect,
5968 perm2_mask1);
5969 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5970 vect[0] = data_ref;
5972 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5973 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5974 second_vect, second_vect,
5975 perm2_mask2);
5976 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5977 vect[1] = data_ref;
5979 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5980 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5981 vect[0], vect[1], shift1_mask);
5982 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5983 (*result_chain)[j/2 + length/2] = data_ref;
5985 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5986 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5987 vect[0], vect[1], select_mask);
5988 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5989 (*result_chain)[j/2] = data_ref;
5991 memcpy (dr_chain.address (), result_chain->address (),
5992 length * sizeof (tree));
5994 return true;
5996 if (length == 3 && vf > 2)
5998 unsigned int k = 0, l = 0;
6000 /* Generating permutation constant to get all elements in right order.
6001 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6002 for (i = 0; i < nelt; i++)
6004 if (3 * k + (l % 3) >= nelt)
6006 k = 0;
6007 l += (3 - (nelt % 3));
6009 sel[i] = 3 * k + (l % 3);
6010 k++;
6012 vec_perm_indices indices (sel, 2, nelt);
6013 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6015 if (dump_enabled_p ())
6016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6017 "shuffle of 3 fields structure is not \
6018 supported by target\n");
6019 return false;
6021 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6023 /* Generating permutation constant to shift all elements.
6024 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6025 for (i = 0; i < nelt; i++)
6026 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6027 indices.new_vector (sel, 2, nelt);
6028 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6030 if (dump_enabled_p ())
6031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6032 "shift permutation is not supported by target\n");
6033 return false;
6035 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6037 /* Generating permutation constant to shift all elements.
6038 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6039 for (i = 0; i < nelt; i++)
6040 sel[i] = 2 * (nelt / 3) + 1 + i;
6041 indices.new_vector (sel, 2, nelt);
6042 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6044 if (dump_enabled_p ())
6045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6046 "shift permutation is not supported by target\n");
6047 return false;
6049 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6051 /* Generating permutation constant to shift all elements.
6052 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6053 for (i = 0; i < nelt; i++)
6054 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6055 indices.new_vector (sel, 2, nelt);
6056 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6058 if (dump_enabled_p ())
6059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6060 "shift permutation is not supported by target\n");
6061 return false;
6063 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6065 /* Generating permutation constant to shift all elements.
6066 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6067 for (i = 0; i < nelt; i++)
6068 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6069 indices.new_vector (sel, 2, nelt);
6070 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6072 if (dump_enabled_p ())
6073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6074 "shift permutation is not supported by target\n");
6075 return false;
6077 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6079 for (k = 0; k < 3; k++)
6081 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6082 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6083 dr_chain[k], dr_chain[k],
6084 perm3_mask);
6085 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6086 vect[k] = data_ref;
6089 for (k = 0; k < 3; k++)
6091 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6092 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6093 vect[k % 3], vect[(k + 1) % 3],
6094 shift1_mask);
6095 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6096 vect_shift[k] = data_ref;
6099 for (k = 0; k < 3; k++)
6101 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6102 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6103 vect_shift[(4 - k) % 3],
6104 vect_shift[(3 - k) % 3],
6105 shift2_mask);
6106 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6107 vect[k] = data_ref;
6110 (*result_chain)[3 - (nelt % 3)] = vect[2];
6112 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6113 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6114 vect[0], shift3_mask);
6115 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6116 (*result_chain)[nelt % 3] = data_ref;
6118 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6119 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6120 vect[1], shift4_mask);
6121 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6122 (*result_chain)[0] = data_ref;
6123 return true;
6125 return false;
6128 /* Function vect_transform_grouped_load.
6130 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6131 to perform their permutation and ascribe the result vectorized statements to
6132 the scalar statements. */
6135 void
6136 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6137 int size, gimple_stmt_iterator *gsi)
6139 machine_mode mode;
6140 vec<tree> result_chain = vNULL;
6142 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6143 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6144 vectors, that are ready for vector computation. */
6145 result_chain.create (size);
6147 /* If the reassociation width for the vector type is 2 or greater, the target
6148 machine can execute 2 or more vector instructions in parallel. Otherwise
6149 try to get the chain for the load group using vect_shift_permute_load_chain. */
6150 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6151 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6152 || pow2p_hwi (size)
6153 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6154 gsi, &result_chain))
6155 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6156 vect_record_grouped_load_vectors (stmt_info, result_chain);
6157 result_chain.release ();
6160 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6161 generated as part of the vectorization of STMT_INFO. Assign the statement
6162 for each vector to the associated scalar statement. */
6164 void
6165 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6166 vec<tree> result_chain)
6168 vec_info *vinfo = stmt_info->vinfo;
6169 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6170 unsigned int i, gap_count;
6171 tree tmp_data_ref;
6173 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6174 Since we scan the chain starting from its first node, their order
6175 corresponds to the order of data-refs in RESULT_CHAIN. */
6176 stmt_vec_info next_stmt_info = first_stmt_info;
6177 gap_count = 1;
6178 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6180 if (!next_stmt_info)
6181 break;
6183 /* Skip the gaps. Loads created for the gaps will be removed by a later
6184 dead code elimination pass. No need to check for the first stmt in
6185 the group, since it always exists.
6186 DR_GROUP_GAP is the number of steps in elements from the previous
6187 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6188 correspond to the gaps. */
6189 if (next_stmt_info != first_stmt_info
6190 && gap_count < DR_GROUP_GAP (next_stmt_info))
6192 gap_count++;
6193 continue;
6196 while (next_stmt_info)
6198 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6199 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6200 copies, and we put the new vector statement in the first available
6201 RELATED_STMT. */
6202 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6203 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6204 else
6206 if (!DR_GROUP_SAME_DR_STMT (next_stmt_info))
6208 stmt_vec_info prev_stmt_info
6209 = STMT_VINFO_VEC_STMT (next_stmt_info);
6210 stmt_vec_info rel_stmt_info
6211 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6212 while (rel_stmt_info)
6214 prev_stmt_info = rel_stmt_info;
6215 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6218 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6222 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6223 gap_count = 1;
6224 /* If NEXT_STMT_INFO accesses the same DR as the previous statement,
6225 put the same TMP_DATA_REF as its vectorized statement; otherwise
6226 get the next data-ref from RESULT_CHAIN. */
6227 if (!next_stmt_info || !DR_GROUP_SAME_DR_STMT (next_stmt_info))
6228 break;
6233 /* Function vect_can_force_dr_alignment_p.
6235 Returns whether the alignment of DECL can be forced to be on an
6236 ALIGNMENT-bit boundary. */
6238 bool
6239 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6241 if (!VAR_P (decl))
6242 return false;
6244 if (decl_in_symtab_p (decl)
6245 && !symtab_node::get (decl)->can_increase_alignment_p ())
6246 return false;
6248 if (TREE_STATIC (decl))
6249 return (alignment <= MAX_OFILE_ALIGNMENT);
6250 else
6251 return (alignment <= MAX_STACK_ALIGNMENT);
6255 /* Return whether the data reference DR_INFO is supported with respect to its
6256 alignment.
6257 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6258 if it is aligned, i.e., check if it is possible to vectorize it with a
6259 different alignment. */
6261 enum dr_alignment_support
6262 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6263 bool check_aligned_accesses)
6265 data_reference *dr = dr_info->dr;
6266 stmt_vec_info stmt_info = dr_info->stmt;
6267 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6268 machine_mode mode = TYPE_MODE (vectype);
6269 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6270 struct loop *vect_loop = NULL;
6271 bool nested_in_vect_loop = false;
6273 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6274 return dr_aligned;
6276 /* For now assume all conditional loads/stores support unaligned
6277 access without any special code. */
6278 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6279 if (gimple_call_internal_p (stmt)
6280 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6281 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6282 return dr_unaligned_supported;
6284 if (loop_vinfo)
6286 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6287 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6290 /* Possibly unaligned access. */
6292 /* We can choose between using the implicit realignment scheme (generating
6293 a misaligned_move stmt) and the explicit realignment scheme (generating
6294 aligned loads with a REALIGN_LOAD). There are two variants to the
6295 explicit realignment scheme: optimized, and unoptimized.
6296 We can optimize the realignment only if the step between consecutive
6297 vector loads is equal to the vector size. Since the vector memory
6298 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6299 is guaranteed that the misalignment amount remains the same throughout the
6300 execution of the vectorized loop. Therefore, we can create the
6301 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6302 at the loop preheader.
6304 However, in the case of outer-loop vectorization, when vectorizing a
6305 memory access in the inner-loop nested within the LOOP that is now being
6306 vectorized, while it is guaranteed that the misalignment of the
6307 vectorized memory access will remain the same in different outer-loop
6308 iterations, it is *not* guaranteed that it will remain the same throughout
6309 the execution of the inner-loop. This is because the inner-loop advances
6310 with the original scalar step (and not in steps of VS). If the inner-loop
6311 step happens to be a multiple of VS, then the misalignment remains fixed
6312 and we can use the optimized realignment scheme. For example:
6314 for (i=0; i<N; i++)
6315 for (j=0; j<M; j++)
6316 s += a[i+j];
6318 When vectorizing the i-loop in the above example, the step between
6319 consecutive vector loads is 1, and so the misalignment does not remain
6320 fixed across the execution of the inner-loop, and the realignment cannot
6321 be optimized (as illustrated in the following pseudo vectorized loop):
6323 for (i=0; i<N; i+=4)
6324 for (j=0; j<M; j++){
6325 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6326 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6327 // (assuming that we start from an aligned address).
6330 We therefore have to use the unoptimized realignment scheme:
6332 for (i=0; i<N; i+=4)
6333 for (j=k; j<M; j+=4)
6334 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6335 // that the misalignment of the initial address is
6336 // 0).
6338 The loop can then be vectorized as follows:
6340 for (k=0; k<4; k++){
6341 rt = get_realignment_token (&vp[k]);
6342 for (i=0; i<N; i+=4){
6343 v1 = vp[i+k];
6344 for (j=k; j<M; j+=4){
6345 v2 = vp[i+j+VS-1];
6346 va = REALIGN_LOAD <v1,v2,rt>;
6347 vs += va;
6348 v1 = v2;
6351 } */
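/* Illustrative sketch (kept under #if 0, never compiled): the misalignment
   arithmetic behind the two pseudo loops above, assuming a hypothetical
   VS of 4 elements and an aligned base.  The first line cycles
   0 1 2 3 0 1 2 3 ... (inner loop advancing by the scalar step); the
   second stays at a fixed K (inner loop advancing by VS), which is why
   the optimized scheme applies only in the latter case.  */
#if 0
#include <stdio.h>

#define VS 4	/* hypothetical vector size, in elements */

int
main (void)
{
  /* Misalignment of &vp[i+j], in elements, when j advances by 1.  */
  for (int j = 0; j < 2 * VS; j++)
    printf ("%d ", j % VS);
  printf ("\n");

  /* Misalignment of &vp[i+j] when j advances by VS starting at K:
     it stays equal to K, so the realignment token can be hoisted.  */
  int k = 2;
  for (int j = k; j < k + 8 * VS; j += VS)
    printf ("%d ", j % VS);
  printf ("\n");
  return 0;
}
#endif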
6353 if (DR_IS_READ (dr))
6355 bool is_packed = false;
6356 tree type = (TREE_TYPE (DR_REF (dr)));
6358 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6359 && (!targetm.vectorize.builtin_mask_for_load
6360 || targetm.vectorize.builtin_mask_for_load ()))
6362 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6364 /* If we are doing SLP then the accesses need not have the
6365 same alignment, instead it depends on the SLP group size. */
6366 if (loop_vinfo
6367 && STMT_SLP_TYPE (stmt_info)
6368 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6369 * (DR_GROUP_SIZE
6370 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6371 TYPE_VECTOR_SUBPARTS (vectype)))
6373 else if (!loop_vinfo
6374 || (nested_in_vect_loop
6375 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6376 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6377 return dr_explicit_realign;
6378 else
6379 return dr_explicit_realign_optimized;
6381 if (!known_alignment_for_access_p (dr_info))
6382 is_packed = not_size_aligned (DR_REF (dr));
6384 if (targetm.vectorize.support_vector_misalignment
6385 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6386 /* Can't software pipeline the loads, but can at least do them. */
6387 return dr_unaligned_supported;
6389 else
6391 bool is_packed = false;
6392 tree type = (TREE_TYPE (DR_REF (dr)));
6394 if (!known_alignment_for_access_p (dr_info))
6395 is_packed = not_size_aligned (DR_REF (dr));
6397 if (targetm.vectorize.support_vector_misalignment
6398 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6399 return dr_unaligned_supported;
6402 /* Unsupported. */
6403 return dr_unaligned_unsupported;