gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
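/* For illustration (a sketch, not part of the original sources): an
   interleaved access group such as

     for (i = 0; i < n; i++)
       {
         x[i] = a[2 * i];
         y[i] = a[2 * i + 1];
       }

   would typically be queried with COUNT == 2 and VECTYPE the vector type
   chosen for the elements of A; the optab is usable only if the target
   has an array (or integer) mode holding two such vectors and implements
   the conversion for it.  */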
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
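/* For illustration, with a hypothetical loop

     short *s; int *p;
     for (i = 0; i < n; i++)
       p[i] = (int) s[i];

   the widening statement has an 'int' result but a 'short' rhs, so this
   function returns the 'short' type with *LHS_SIZE_UNIT == 4 and
   *RHS_SIZE_UNIT == 2; it is the smaller type that bounds the VF.  */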
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
120 HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt_info->stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
134 if (assign
135 && (gimple_assign_cast_p (assign)
136 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
140 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
142 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
144 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
145 if (rhs < lhs)
146 scalar_type = rhs_type;
149 *lhs_size_unit = lhs;
150 *rhs_size_unit = rhs;
151 return scalar_type;
155 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
156 tested at run-time. Return TRUE if DDR was successfully inserted.
157 Return false if versioning is not supported. */
159 static opt_result
160 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
162 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
164 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
165 return opt_result::failure_at (vect_location,
166 "will not create alias checks, as"
167 " --param vect-max-version-for-alias-checks"
168 " == 0\n");
170 opt_result res
171 = runtime_alias_check_p (ddr, loop,
172 optimize_loop_nest_for_speed_p (loop));
173 if (!res)
174 return res;
176 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
177 return opt_result::success ();
180 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
182 static void
183 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
185 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
186 for (unsigned int i = 0; i < checks.length(); ++i)
187 if (checks[i] == value)
188 return;
190 if (dump_enabled_p ())
191 dump_printf_loc (MSG_NOTE, vect_location,
192 "need run-time check that %T is nonzero\n",
193 value);
194 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
197 /* Return true if we know that the order of vectorized DR_INFO_A and
198 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
199 DR_INFO_B. At least one of the accesses is a write. */
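/* For illustration (a sketch): given the access group

     ... = a[i];       <-- load 1
     a[i] = ...;       <-- store
     ... = a[i+1];     <-- load 2

   both loads are emitted at the position of load 2 while the store stays
   put, so the original load-1-before-store order is not preserved and
   this function returns false for that pair.  */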
201 static bool
202 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
204 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
205 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
207 /* Single statements are always kept in their original order. */
208 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
209 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
210 return true;
212 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
213 group are emitted at the position of the last scalar load and all
214 stores in a group are emitted at the position of the last scalar store.
215 Compute that position and check whether the resulting order matches
216 the current one. */
217 stmt_vec_info last_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
218 if (last_a)
219 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
220 s = DR_GROUP_NEXT_ELEMENT (s))
221 last_a = get_later_stmt (last_a, s);
222 else
223 last_a = stmtinfo_a;
224 stmt_vec_info last_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
225 if (last_b)
226 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
227 s = DR_GROUP_NEXT_ELEMENT (s))
228 last_b = get_later_stmt (last_b, s);
229 else
230 last_b = stmtinfo_b;
231 return ((get_later_stmt (last_a, last_b) == last_a)
232 == (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a));
235 /* A subroutine of vect_analyze_data_ref_dependence. Handle
236 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
237 distances. These distances are conservatively correct but they don't
238 reflect a guaranteed dependence.
240 Return true if this function does all the work necessary to avoid
241 an alias or false if the caller should use the dependence distances
242 to limit the vectorization factor in the usual way. LOOP_DEPTH is
243 the depth of the loop described by LOOP_VINFO and the other arguments
244 are as for vect_analyze_data_ref_dependence. */
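/* For illustration (a hypothetical example): with

     #pragma omp simd safelen(8)
     for (i = 0; i < n; i++)
       p[i] = q[i + 4];

   where P and Q may refer to the same array, the conservative distance
   of 4 is covered by the user-asserted safelen of 8, so independence is
   assumed and *MAX_VF is capped at 8 rather than adding a runtime alias
   check.  */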
246 static bool
247 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
248 loop_vec_info loop_vinfo,
249 int loop_depth, unsigned int *max_vf)
251 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
252 lambda_vector dist_v;
253 unsigned int i;
254 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
256 int dist = dist_v[loop_depth];
257 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
259 /* If the user asserted safelen >= DIST consecutive iterations
260 can be executed concurrently, assume independence.
262 ??? An alternative would be to add the alias check even
263 in this case, and vectorize the fallback loop with the
264 maximum VF set to safelen. However, if the user has
265 explicitly given a length, it's less likely that that
266 would be a win. */
267 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
269 if ((unsigned int) loop->safelen < *max_vf)
270 *max_vf = loop->safelen;
271 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
272 continue;
275 /* For dependence distances of 2 or more, we have the option
276 of limiting VF or checking for an alias at runtime.
277 Prefer to check at runtime if we can, to avoid limiting
278 the VF unnecessarily when the bases are in fact independent.
280 Note that the alias checks will be removed if the VF ends up
281 being small enough. */
282 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
283 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
284 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
285 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
286 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
289 return true;
293 /* Function vect_analyze_data_ref_dependence.
295 FIXME: I needed to change the sense of the returned flag.
297 Return FALSE if there (might) exist a dependence between a memory-reference
298 DRA and a memory-reference DRB. When versioning for alias may check a
299 dependence at run-time, return TRUE. Adjust *MAX_VF according to
300 the data dependence. */
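/* For illustration, in a loop like

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the distance between the store to a[i + 3] and the later read of the
   same element is 3, so *MAX_VF is reduced to 3 below; a distance that
   is at least the VF no longer constrains vectorization.  */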
302 static opt_result
303 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
304 loop_vec_info loop_vinfo,
305 unsigned int *max_vf)
307 unsigned int i;
308 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
309 struct data_reference *dra = DDR_A (ddr);
310 struct data_reference *drb = DDR_B (ddr);
311 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
312 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
313 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
314 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
315 lambda_vector dist_v;
316 unsigned int loop_depth;
318 /* In loop analysis all data references should be vectorizable. */
319 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
320 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
321 gcc_unreachable ();
323 /* Independent data accesses. */
324 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
325 return opt_result::success ();
327 if (dra == drb
328 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
329 return opt_result::success ();
331 /* We do not have to consider dependences between accesses that belong
332 to the same group, unless the stride could be smaller than the
333 group size. */
334 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
335 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
336 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
337 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
338 return opt_result::success ();
340 /* Even if we have an anti-dependence then, as the vectorized loop covers at
341 least two scalar iterations, there is always also a true dependence.
342 As the vectorizer does not re-order loads and stores we can ignore
343 the anti-dependence if TBAA can disambiguate both DRs similar to the
344 case with known negative distance anti-dependences (positive
345 distance anti-dependences would violate TBAA constraints). */
346 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
347 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
348 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
349 get_alias_set (DR_REF (drb))))
350 return opt_result::success ();
352 /* Unknown data dependence. */
353 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
355 /* If user asserted safelen consecutive iterations can be
356 executed concurrently, assume independence. */
357 if (loop->safelen >= 2)
359 if ((unsigned int) loop->safelen < *max_vf)
360 *max_vf = loop->safelen;
361 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
362 return opt_result::success ();
365 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
366 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
367 return opt_result::failure_at
368 (stmtinfo_a->stmt,
369 "versioning for alias not supported for: "
370 "can't determine dependence between %T and %T\n",
371 DR_REF (dra), DR_REF (drb));
373 if (dump_enabled_p ())
374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
375 "versioning for alias required: "
376 "can't determine dependence between %T and %T\n",
377 DR_REF (dra), DR_REF (drb));
379 /* Add to list of ddrs that need to be tested at run-time. */
380 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
383 /* Known data dependence. */
384 if (DDR_NUM_DIST_VECTS (ddr) == 0)
386 /* If user asserted safelen consecutive iterations can be
387 executed concurrently, assume independence. */
388 if (loop->safelen >= 2)
390 if ((unsigned int) loop->safelen < *max_vf)
391 *max_vf = loop->safelen;
392 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
393 return opt_result::success ();
396 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
397 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
398 return opt_result::failure_at
399 (stmtinfo_a->stmt,
400 "versioning for alias not supported for: "
401 "bad dist vector for %T and %T\n",
402 DR_REF (dra), DR_REF (drb));
404 if (dump_enabled_p ())
405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
406 "versioning for alias required: "
407 "bad dist vector for %T and %T\n",
408 DR_REF (dra), DR_REF (drb));
409 /* Add to list of ddrs that need to be tested at run-time. */
410 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
413 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
415 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
416 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
417 loop_depth, max_vf))
418 return opt_result::success ();
420 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
422 int dist = dist_v[loop_depth];
424 if (dump_enabled_p ())
425 dump_printf_loc (MSG_NOTE, vect_location,
426 "dependence distance = %d.\n", dist);
428 if (dist == 0)
430 if (dump_enabled_p ())
431 dump_printf_loc (MSG_NOTE, vect_location,
432 "dependence distance == 0 between %T and %T\n",
433 DR_REF (dra), DR_REF (drb));
435 /* When we perform grouped accesses and perform implicit CSE
436 by detecting equal accesses and doing disambiguation with
437 runtime alias tests like for
438 .. = a[i];
439 .. = a[i+1];
440 a[i] = ..;
441 a[i+1] = ..;
442 *p = ..;
443 .. = a[i];
444 .. = a[i+1];
445 where we will end up loading { a[i], a[i+1] } once, make
446 sure that inserting group loads before the first load and
447 stores after the last store will do the right thing.
448 Similar for groups like
449 a[i] = ...;
450 ... = a[i];
451 a[i+1] = ...;
452 where loads from the group interleave with the store. */
453 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
454 return opt_result::failure_at (stmtinfo_a->stmt,
455 "READ_WRITE dependence"
456 " in interleaving.\n");
458 if (loop->safelen < 2)
460 tree indicator = dr_zero_step_indicator (dra);
461 if (!indicator || integer_zerop (indicator))
462 return opt_result::failure_at (stmtinfo_a->stmt,
463 "access also has a zero step\n");
464 else if (TREE_CODE (indicator) != INTEGER_CST)
465 vect_check_nonzero_value (loop_vinfo, indicator);
467 continue;
470 if (dist > 0 && DDR_REVERSED_P (ddr))
472 /* If DDR_REVERSED_P the order of the data-refs in DDR was
473 reversed (to make distance vector positive), and the actual
474 distance is negative. */
475 if (dump_enabled_p ())
476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
477 "dependence distance negative.\n");
478 /* Record a negative dependence distance to later limit the
479 amount of stmt copying / unrolling we can perform.
480 Only need to handle read-after-write dependence. */
481 if (DR_IS_READ (drb)
482 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
483 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
484 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
485 continue;
488 unsigned int abs_dist = abs (dist);
489 if (abs_dist >= 2 && abs_dist < *max_vf)
491 /* The dependence distance requires reduction of the maximal
492 vectorization factor. */
493 *max_vf = abs (dist);
494 if (dump_enabled_p ())
495 dump_printf_loc (MSG_NOTE, vect_location,
496 "adjusting maximal vectorization factor to %i\n",
497 *max_vf);
500 if (abs_dist >= *max_vf)
502 /* Dependence distance does not create dependence, as far as
503 vectorization is concerned, in this case. */
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location,
506 "dependence distance >= VF.\n");
507 continue;
510 return opt_result::failure_at (stmtinfo_a->stmt,
511 "not vectorized, possible dependence "
512 "between data-refs %T and %T\n",
513 DR_REF (dra), DR_REF (drb));
516 return opt_result::success ();
519 /* Function vect_analyze_data_ref_dependences.
521 Examine all the data references in the loop, and make sure there do not
522 exist any data dependences between them. Set *MAX_VF according to
523 the maximum vectorization factor the data dependences allow. */
525 opt_result
526 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
527 unsigned int *max_vf)
529 unsigned int i;
530 struct data_dependence_relation *ddr;
532 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
534 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
536 LOOP_VINFO_DDRS (loop_vinfo)
537 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
538 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
539 /* We need read-read dependences to compute
540 STMT_VINFO_SAME_ALIGN_REFS. */
541 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
542 &LOOP_VINFO_DDRS (loop_vinfo),
543 LOOP_VINFO_LOOP_NEST (loop_vinfo),
544 true);
545 gcc_assert (res);
548 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
550 /* For epilogues we either have no aliases or alias versioning
551 was applied to the original loop. Therefore we may just get max_vf
552 using the VF of the original loop. */
553 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
554 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
555 else
556 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
558 opt_result res
559 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
560 if (!res)
561 return res;
564 return opt_result::success ();
568 /* Function vect_slp_analyze_data_ref_dependence.
570 Return TRUE if there (might) exist a dependence between a memory-reference
571 DRA and a memory-reference DRB for VINFO. Unknown dependences cannot be
572 resolved by a runtime alias check here, so they are treated as real
573 dependences. Return FALSE if the accesses are known to be independent. */
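/* For illustration (a sketch): in straight-line code such as

     a[0] = x;
     t = *p;
     a[1] = y;

   vectorizing the store group means sinking the store to a[0] past the
   load through P; if that dependence comes back as "don't know" it is
   simply reported as a dependence here and the callers give up on the
   SLP instance.  */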
575 static bool
576 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
577 struct data_dependence_relation *ddr)
579 struct data_reference *dra = DDR_A (ddr);
580 struct data_reference *drb = DDR_B (ddr);
581 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
582 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
584 /* We need to check dependences of statements marked as unvectorizable
585 as well; they can still prohibit vectorization. */
587 /* Independent data accesses. */
588 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
589 return false;
591 if (dra == drb)
592 return false;
594 /* Read-read is OK. */
595 if (DR_IS_READ (dra) && DR_IS_READ (drb))
596 return false;
598 /* If dra and drb are part of the same interleaving chain consider
599 them independent. */
600 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
601 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
602 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
603 return false;
605 /* Unknown data dependence. */
606 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
608 if (dump_enabled_p ())
609 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
610 "can't determine dependence between %T and %T\n",
611 DR_REF (dra), DR_REF (drb));
613 else if (dump_enabled_p ())
614 dump_printf_loc (MSG_NOTE, vect_location,
615 "determined dependence between %T and %T\n",
616 DR_REF (dra), DR_REF (drb));
618 return true;
622 /* Analyze dependences involved in the transform of SLP NODE. STORES
623 contain the vector of scalar stores of this instance if we are
624 disambiguating the loads. */
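/* For illustration (a sketch): for the SLP load group in

     x0 = a[0];
     *p = t;
     x1 = a[1];

   the load of a[0] has to be sunk to the position of the load of a[1],
   so the walk below checks whether the intervening store through P may
   clobber a[0], using its recorded data reference when there is one and
   falling back to the alias oracle otherwise.  */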
626 static bool
627 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
628 vec<stmt_vec_info> stores,
629 stmt_vec_info last_store_info)
631 /* This walks over all stmts involved in the SLP load/store done
632 in NODE verifying we can sink them up to the last stmt in the
633 group. */
634 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
635 vec_info *vinfo = last_access_info->vinfo;
636 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
638 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
639 if (access_info == last_access_info)
640 continue;
641 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
642 ao_ref ref;
643 bool ref_initialized_p = false;
644 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
645 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
647 gimple *stmt = gsi_stmt (gsi);
648 if (! gimple_vuse (stmt)
649 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
650 continue;
652 /* If we couldn't record a (single) data reference for this
653 stmt we have to resort to the alias oracle. */
654 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
655 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
656 if (!dr_b)
658 /* We are moving a store or sinking a load - this means
659 we cannot use TBAA for disambiguation. */
660 if (!ref_initialized_p)
661 ao_ref_init (&ref, DR_REF (dr_a));
662 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
663 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
664 return false;
665 continue;
668 bool dependent = false;
669 /* If we run into a store of this same instance (we've just
670 marked those) then delay dependence checking until we run
671 into the last store because this is where it will have
672 been sunk to (and we verify if we can do that as well). */
673 if (gimple_visited_p (stmt))
675 if (stmt_info != last_store_info)
676 continue;
677 unsigned i;
678 stmt_vec_info store_info;
679 FOR_EACH_VEC_ELT (stores, i, store_info)
681 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
682 ddr_p ddr = initialize_data_dependence_relation
683 (dr_a, store_dr, vNULL);
684 dependent
685 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
686 free_dependence_relation (ddr);
687 if (dependent)
688 break;
691 else
693 ddr_p ddr = initialize_data_dependence_relation (dr_a,
694 dr_b, vNULL);
695 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
696 free_dependence_relation (ddr);
698 if (dependent)
699 return false;
702 return true;
706 /* Function vect_slp_analyze_instance_dependence.
708 Examine all the data references in SLP instance INSTANCE, and make sure
709 there do not exist any data dependences that would prevent the scalar
710 loads and stores from being sunk to their vectorized insert locations. */
712 bool
713 vect_slp_analyze_instance_dependence (slp_instance instance)
715 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
717 /* The stores of this instance are at the root of the SLP tree. */
718 slp_tree store = SLP_INSTANCE_TREE (instance);
719 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
720 store = NULL;
722 /* Verify we can sink stores to the vectorized stmt insert location. */
723 stmt_vec_info last_store_info = NULL;
724 if (store)
726 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
727 return false;
729 /* Mark stores in this instance and remember the last one. */
730 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
731 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
732 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
735 bool res = true;
737 /* Verify we can sink loads to the vectorized stmt insert location,
738 special-casing stores of this instance. */
739 slp_tree load;
740 unsigned int i;
741 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
742 if (! vect_slp_analyze_node_dependences (instance, load,
743 store
744 ? SLP_TREE_SCALAR_STMTS (store)
745 : vNULL, last_store_info))
747 res = false;
748 break;
751 /* Unset the visited flag. */
752 if (store)
753 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
754 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
756 return res;
759 /* Record the base alignment guarantee given by DRB, which occurs
760 in STMT_INFO. */
762 static void
763 vect_record_base_alignment (stmt_vec_info stmt_info,
764 innermost_loop_behavior *drb)
766 vec_info *vinfo = stmt_info->vinfo;
767 bool existed;
768 innermost_loop_behavior *&entry
769 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
770 if (!existed || entry->base_alignment < drb->base_alignment)
772 entry = drb;
773 if (dump_enabled_p ())
774 dump_printf_loc (MSG_NOTE, vect_location,
775 "recording new base alignment for %T\n"
776 " alignment: %d\n"
777 " misalignment: %d\n"
778 " based on: %G",
779 drb->base_address,
780 drb->base_alignment,
781 drb->base_misalignment,
782 stmt_info->stmt);
786 /* If the region we're going to vectorize is reached, all unconditional
787 data references occur at least once. We can therefore pool the base
788 alignment guarantees from each unconditional reference. Do this by
789 going through all the data references in VINFO and checking whether
790 the containing statement makes the reference unconditionally. If so,
791 record the alignment of the base address in VINFO so that it can be
792 used for all other references with the same base. */
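/* For illustration (a hypothetical example): in

     for (i = 0; i < n; i++)
       {
         s += q[i];
         if (c[i])
           s += q[i + 1];
       }

   the guarantee obtained from the unconditional load of q[i] is recorded
   once for the base address of Q and can then be reused when computing
   the misalignment of the conditional access to q[i + 1].  */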
794 void
795 vect_record_base_alignments (vec_info *vinfo)
797 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
798 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
799 data_reference *dr;
800 unsigned int i;
801 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
803 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
804 stmt_vec_info stmt_info = dr_info->stmt;
805 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
806 && STMT_VINFO_VECTORIZABLE (stmt_info)
807 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
809 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
811 /* If DR is nested in the loop that is being vectorized, we can also
812 record the alignment of the base wrt the outer loop. */
813 if (loop && nested_in_vect_loop_p (loop, stmt_info))
814 vect_record_base_alignment
815 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
820 /* Return the target alignment for the vectorized form of DR_INFO. */
822 static poly_uint64
823 vect_calculate_target_alignment (dr_vec_info *dr_info)
825 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
826 return targetm.vectorize.preferred_vector_alignment (vectype);
829 /* Function vect_compute_data_ref_alignment
831 Compute the misalignment of the data reference DR_INFO.
833 Output:
834 1. DR_MISALIGNMENT (DR_INFO) is defined.
836 FOR NOW: No analysis is actually performed. Misalignment is calculated
837 only for trivial cases. TODO. */
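/* For illustration (a worked example, assuming a 16-byte preferred vector
   alignment): a reference p[i] with 4-byte elements, a base known to be
   16-byte aligned and a constant start offset (DR_INIT) of 8 bytes ends
   up with DR_MISALIGNMENT == 8; peeling (16 - 8) / 4 == 2 scalar
   iterations would then make the access aligned.  */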
839 static void
840 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
842 stmt_vec_info stmt_info = dr_info->stmt;
843 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
844 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
845 struct loop *loop = NULL;
846 tree ref = DR_REF (dr_info->dr);
847 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
849 if (dump_enabled_p ())
850 dump_printf_loc (MSG_NOTE, vect_location,
851 "vect_compute_data_ref_alignment:\n");
853 if (loop_vinfo)
854 loop = LOOP_VINFO_LOOP (loop_vinfo);
856 /* Initialize misalignment to unknown. */
857 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
859 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
860 return;
862 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
863 bool step_preserves_misalignment_p;
865 poly_uint64 vector_alignment
866 = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
867 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
869 unsigned HOST_WIDE_INT vect_align_c;
870 if (!vector_alignment.is_constant (&vect_align_c))
871 return;
873 /* No step for BB vectorization. */
874 if (!loop)
876 gcc_assert (integer_zerop (drb->step));
877 step_preserves_misalignment_p = true;
880 /* In case the dataref is in an inner-loop of the loop that is being
881 vectorized (LOOP), we use the base and misalignment information
882 relative to the outer-loop (LOOP). This is ok only if the misalignment
883 stays the same throughout the execution of the inner-loop, which is why
884 we have to check that the stride of the dataref in the inner-loop is
885 evenly divisible by the vector alignment. */
886 else if (nested_in_vect_loop_p (loop, stmt_info))
888 step_preserves_misalignment_p
889 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
891 if (dump_enabled_p ())
893 if (step_preserves_misalignment_p)
894 dump_printf_loc (MSG_NOTE, vect_location,
895 "inner step divides the vector alignment.\n");
896 else
897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
898 "inner step doesn't divide the vector"
899 " alignment.\n");
903 /* Similarly we can only use base and misalignment information relative to
904 an innermost loop if the misalignment stays the same throughout the
905 execution of the loop. As above, this is the case if the stride of
906 the dataref is evenly divisible by the alignment. */
907 else
909 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
910 step_preserves_misalignment_p
911 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
913 if (!step_preserves_misalignment_p && dump_enabled_p ())
914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
915 "step doesn't divide the vector alignment.\n");
918 unsigned int base_alignment = drb->base_alignment;
919 unsigned int base_misalignment = drb->base_misalignment;
921 /* Calculate the maximum of the pooled base address alignment and the
922 alignment that we can compute for DR itself. */
923 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
924 if (entry && base_alignment < (*entry)->base_alignment)
926 base_alignment = (*entry)->base_alignment;
927 base_misalignment = (*entry)->base_misalignment;
930 if (drb->offset_alignment < vect_align_c
931 || !step_preserves_misalignment_p
932 /* We need to know whether the step wrt the vectorized loop is
933 negative when computing the starting misalignment below. */
934 || TREE_CODE (drb->step) != INTEGER_CST)
936 if (dump_enabled_p ())
937 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
938 "Unknown alignment for access: %T\n", ref);
939 return;
942 if (base_alignment < vect_align_c)
944 unsigned int max_alignment;
945 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
946 if (max_alignment < vect_align_c
947 || !vect_can_force_dr_alignment_p (base,
948 vect_align_c * BITS_PER_UNIT))
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE, vect_location,
952 "can't force alignment of ref: %T\n", ref);
953 return;
956 /* Force the alignment of the decl.
957 NOTE: This is the only change to the code we make during
958 the analysis phase, before deciding to vectorize the loop. */
959 if (dump_enabled_p ())
960 dump_printf_loc (MSG_NOTE, vect_location,
961 "force alignment of %T\n", ref);
963 dr_info->base_decl = base;
964 dr_info->base_misaligned = true;
965 base_misalignment = 0;
967 poly_int64 misalignment
968 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
970 /* If this is a backward running DR then first access in the larger
971 vectype actually is N-1 elements before the address in the DR.
972 Adjust misalign accordingly. */
973 if (tree_int_cst_sgn (drb->step) < 0)
974 /* PLUS because STEP is negative. */
975 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
976 * TREE_INT_CST_LOW (drb->step));
978 unsigned int const_misalignment;
979 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
981 if (dump_enabled_p ())
982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
983 "Non-constant misalignment for access: %T\n", ref);
984 return;
987 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
989 if (dump_enabled_p ())
990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
991 "misalign = %d bytes of ref %T\n",
992 DR_MISALIGNMENT (dr_info), ref);
994 return;
997 /* Function vect_update_misalignment_for_peel.
998 Sets DR_INFO's misalignment
999 - to 0 if it has the same alignment as DR_PEEL_INFO,
1000 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1001 - to -1 (unknown) otherwise.
1003 DR_INFO - the data reference whose misalignment is to be adjusted.
1004 DR_PEEL_INFO - the data reference whose misalignment is being made
1005 zero in the vector loop by the peel.
1006 NPEEL - the number of iterations in the peel loop if the misalignment
1007 of DR_PEEL_INFO is known at compile time. */
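/* For illustration (a worked example): with DR_MISALIGNMENT == 4, a
   4-byte DR_STEP, a 16-byte target alignment and NPEEL == 3, the updated
   misalignment is (4 + 3 * 4) & 15 == 0, i.e. the peel happens to align
   this reference as well.  */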
1009 static void
1010 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1011 dr_vec_info *dr_peel_info, int npeel)
1013 unsigned int i;
1014 vec<dr_p> same_aligned_drs;
1015 struct data_reference *current_dr;
1016 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1018 /* It can be assumed that if dr_info has the same alignment as dr_peel,
1019 it is aligned in the vector loop. */
1020 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1021 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1023 if (current_dr != dr_info->dr)
1024 continue;
1025 gcc_assert (!known_alignment_for_access_p (dr_info)
1026 || !known_alignment_for_access_p (dr_peel_info)
1027 || (DR_MISALIGNMENT (dr_info)
1028 == DR_MISALIGNMENT (dr_peel_info)));
1029 SET_DR_MISALIGNMENT (dr_info, 0);
1030 return;
1033 unsigned HOST_WIDE_INT alignment;
1034 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1035 && known_alignment_for_access_p (dr_info)
1036 && known_alignment_for_access_p (dr_peel_info))
1038 int misal = DR_MISALIGNMENT (dr_info);
1039 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1040 misal &= alignment - 1;
1041 SET_DR_MISALIGNMENT (dr_info, misal);
1042 return;
1045 if (dump_enabled_p ())
1046 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1047 "to unknown (-1).\n");
1048 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1052 /* Function verify_data_ref_alignment
1054 Return TRUE if DR_INFO can be handled with respect to alignment. */
1056 static opt_result
1057 verify_data_ref_alignment (dr_vec_info *dr_info)
1059 enum dr_alignment_support supportable_dr_alignment
1060 = vect_supportable_dr_alignment (dr_info, false);
1061 if (!supportable_dr_alignment)
1062 return opt_result::failure_at
1063 (dr_info->stmt->stmt,
1064 DR_IS_READ (dr_info->dr)
1065 ? "not vectorized: unsupported unaligned load: %T\n"
1066 : "not vectorized: unsupported unaligned store: %T\n",
1067 DR_REF (dr_info->dr));
1069 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1070 dump_printf_loc (MSG_NOTE, vect_location,
1071 "Vectorizing an unaligned access.\n");
1073 return opt_result::success ();
1076 /* Function vect_verify_datarefs_alignment
1078 Return TRUE if all data references in the loop can be
1079 handled with respect to alignment. */
1081 opt_result
1082 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1084 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1085 struct data_reference *dr;
1086 unsigned int i;
1088 FOR_EACH_VEC_ELT (datarefs, i, dr)
1090 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1091 stmt_vec_info stmt_info = dr_info->stmt;
1093 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1094 continue;
1096 /* For interleaving, only the alignment of the first access matters. */
1097 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1098 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1099 continue;
1101 /* Strided accesses perform only component accesses, alignment is
1102 irrelevant for them. */
1103 if (STMT_VINFO_STRIDED_P (stmt_info)
1104 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1105 continue;
1107 opt_result res = verify_data_ref_alignment (dr_info);
1108 if (!res)
1109 return res;
1112 return opt_result::success ();
1115 /* Given a memory reference EXP, return whether its alignment is less
1116 than its size. */
1118 static bool
1119 not_size_aligned (tree exp)
1121 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1122 return true;
1124 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1125 > get_object_alignment (exp));
1128 /* Function vector_alignment_reachable_p
1130 Return true if vector alignment for DR_INFO is reachable by peeling
1131 a few loop iterations. Return false otherwise. */
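/* For illustration (a hypothetical example): for a group of size 2 with
   4-element vectors and a misalignment of one element, the prologue
   would have to peel 4 - 1 == 3 scalar iterations, which is not a
   multiple of the group size, so alignment is not considered reachable
   by peeling in the grouped case below.  */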
1133 static bool
1134 vector_alignment_reachable_p (dr_vec_info *dr_info)
1136 stmt_vec_info stmt_info = dr_info->stmt;
1137 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1139 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1141 /* For interleaved access we peel only if number of iterations in
1142 the prolog loop ({VF - misalignment}) is a multiple of the
1143 number of the interleaved accesses. */
1144 int elem_size, mis_in_elements;
1146 /* FORNOW: handle only known alignment. */
1147 if (!known_alignment_for_access_p (dr_info))
1148 return false;
1150 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1151 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1152 elem_size = vector_element_size (vector_size, nelements);
1153 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1155 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1156 return false;
1159 /* If misalignment is known at the compile time then allow peeling
1160 only if natural alignment is reachable through peeling. */
1161 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1163 HOST_WIDE_INT elmsize =
1164 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1165 if (dump_enabled_p ())
1167 dump_printf_loc (MSG_NOTE, vect_location,
1168 "data size = %wd. misalignment = %d.\n", elmsize,
1169 DR_MISALIGNMENT (dr_info));
1171 if (DR_MISALIGNMENT (dr_info) % elmsize)
1173 if (dump_enabled_p ())
1174 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1175 "data size does not divide the misalignment.\n");
1176 return false;
1180 if (!known_alignment_for_access_p (dr_info))
1182 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1183 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1184 if (dump_enabled_p ())
1185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1186 "Unknown misalignment, %snaturally aligned\n",
1187 is_packed ? "not " : "");
1188 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1191 return true;
1195 /* Calculate the cost of the memory access represented by DR_INFO. */
1197 static void
1198 vect_get_data_access_cost (dr_vec_info *dr_info,
1199 unsigned int *inside_cost,
1200 unsigned int *outside_cost,
1201 stmt_vector_for_cost *body_cost_vec,
1202 stmt_vector_for_cost *prologue_cost_vec)
1204 stmt_vec_info stmt_info = dr_info->stmt;
1205 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1206 int ncopies;
1208 if (PURE_SLP_STMT (stmt_info))
1209 ncopies = 1;
1210 else
1211 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1213 if (DR_IS_READ (dr_info->dr))
1214 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1215 prologue_cost_vec, body_cost_vec, false);
1216 else
1217 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1219 if (dump_enabled_p ())
1220 dump_printf_loc (MSG_NOTE, vect_location,
1221 "vect_get_data_access_cost: inside_cost = %d, "
1222 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1226 typedef struct _vect_peel_info
1228 dr_vec_info *dr_info;
1229 int npeel;
1230 unsigned int count;
1231 } *vect_peel_info;
1233 typedef struct _vect_peel_extended_info
1235 struct _vect_peel_info peel_info;
1236 unsigned int inside_cost;
1237 unsigned int outside_cost;
1238 } *vect_peel_extended_info;
1241 /* Peeling hashtable helpers. */
1243 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1245 static inline hashval_t hash (const _vect_peel_info *);
1246 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1249 inline hashval_t
1250 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1252 return (hashval_t) peel_info->npeel;
1255 inline bool
1256 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1258 return (a->npeel == b->npeel);
1262 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1264 static void
1265 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1266 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1267 int npeel)
1269 struct _vect_peel_info elem, *slot;
1270 _vect_peel_info **new_slot;
1271 bool supportable_dr_alignment
1272 = vect_supportable_dr_alignment (dr_info, true);
1274 elem.npeel = npeel;
1275 slot = peeling_htab->find (&elem);
1276 if (slot)
1277 slot->count++;
1278 else
1280 slot = XNEW (struct _vect_peel_info);
1281 slot->npeel = npeel;
1282 slot->dr_info = dr_info;
1283 slot->count = 1;
1284 new_slot = peeling_htab->find_slot (slot, INSERT);
1285 *new_slot = slot;
1288 if (!supportable_dr_alignment
1289 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1290 slot->count += VECT_MAX_COST;
1294 /* Traverse peeling hash table to find peeling option that aligns maximum
1295 number of data accesses. */
1297 static int
1298 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1299 _vect_peel_extended_info *max)
1301 vect_peel_info elem = *slot;
1303 if (elem->count > max->peel_info.count
1304 || (elem->count == max->peel_info.count
1305 && max->peel_info.npeel > elem->npeel))
1307 max->peel_info.npeel = elem->npeel;
1308 max->peel_info.count = elem->count;
1309 max->peel_info.dr_info = elem->dr_info;
1312 return 1;
1315 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1316 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1317 we assume DR0_INFO's misalignment will be zero after peeling. */
1319 static void
1320 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1321 dr_vec_info *dr0_info,
1322 unsigned int *inside_cost,
1323 unsigned int *outside_cost,
1324 stmt_vector_for_cost *body_cost_vec,
1325 stmt_vector_for_cost *prologue_cost_vec,
1326 unsigned int npeel,
1327 bool unknown_misalignment)
1329 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1330 unsigned i;
1331 data_reference *dr;
1333 FOR_EACH_VEC_ELT (datarefs, i, dr)
1335 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1336 stmt_vec_info stmt_info = dr_info->stmt;
1337 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1338 continue;
1340 /* For interleaving, only the alignment of the first access
1341 matters. */
1342 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1343 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1344 continue;
1346 /* Strided accesses perform only component accesses, alignment is
1347 irrelevant for them. */
1348 if (STMT_VINFO_STRIDED_P (stmt_info)
1349 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1350 continue;
1352 int save_misalignment;
1353 save_misalignment = DR_MISALIGNMENT (dr_info);
1354 if (npeel == 0)
1355 ;
1356 else if (unknown_misalignment && dr_info == dr0_info)
1357 SET_DR_MISALIGNMENT (dr_info, 0);
1358 else
1359 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1360 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1361 body_cost_vec, prologue_cost_vec);
1362 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1366 /* Traverse peeling hash table and calculate cost for each peeling option.
1367 Find the one with the lowest cost. */
1369 static int
1370 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1371 _vect_peel_extended_info *min)
1373 vect_peel_info elem = *slot;
1374 int dummy;
1375 unsigned int inside_cost = 0, outside_cost = 0;
1376 stmt_vec_info stmt_info = elem->dr_info->stmt;
1377 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1378 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1379 epilogue_cost_vec;
1381 prologue_cost_vec.create (2);
1382 body_cost_vec.create (2);
1383 epilogue_cost_vec.create (2);
1385 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1386 &outside_cost, &body_cost_vec,
1387 &prologue_cost_vec, elem->npeel, false);
1389 body_cost_vec.release ();
1391 outside_cost += vect_get_known_peeling_cost
1392 (loop_vinfo, elem->npeel, &dummy,
1393 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1394 &prologue_cost_vec, &epilogue_cost_vec);
1396 /* Prologue and epilogue costs are added to the target model later.
1397 These costs depend only on the scalar iteration cost, the
1398 number of peeling iterations finally chosen, and the number of
1399 misaligned statements. So discard the information found here. */
1400 prologue_cost_vec.release ();
1401 epilogue_cost_vec.release ();
1403 if (inside_cost < min->inside_cost
1404 || (inside_cost == min->inside_cost
1405 && outside_cost < min->outside_cost))
1407 min->inside_cost = inside_cost;
1408 min->outside_cost = outside_cost;
1409 min->peel_info.dr_info = elem->dr_info;
1410 min->peel_info.npeel = elem->npeel;
1411 min->peel_info.count = elem->count;
1414 return 1;
1418 /* Choose best peeling option by traversing peeling hash table and either
1419 choosing an option with the lowest cost (if cost model is enabled) or the
1420 option that aligns as many accesses as possible. */
1422 static struct _vect_peel_extended_info
1423 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1424 loop_vec_info loop_vinfo)
1426 struct _vect_peel_extended_info res;
1428 res.peel_info.dr_info = NULL;
1430 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1432 res.inside_cost = INT_MAX;
1433 res.outside_cost = INT_MAX;
1434 peeling_htab->traverse <_vect_peel_extended_info *,
1435 vect_peeling_hash_get_lowest_cost> (&res);
1437 else
1439 res.peel_info.count = 0;
1440 peeling_htab->traverse <_vect_peel_extended_info *,
1441 vect_peeling_hash_get_most_frequent> (&res);
1442 res.inside_cost = 0;
1443 res.outside_cost = 0;
1446 return res;
1449 /* Return true if the new peeling NPEEL is supported. */
1451 static bool
1452 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1453 unsigned npeel)
1455 unsigned i;
1456 struct data_reference *dr = NULL;
1457 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1458 enum dr_alignment_support supportable_dr_alignment;
1460 /* Ensure that all data refs can be vectorized after the peel. */
1461 FOR_EACH_VEC_ELT (datarefs, i, dr)
1463 int save_misalignment;
1465 if (dr == dr0_info->dr)
1466 continue;
1468 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1469 stmt_vec_info stmt_info = dr_info->stmt;
1470 /* For interleaving, only the alignment of the first access
1471 matters. */
1472 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1473 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1474 continue;
1476 /* Strided accesses perform only component accesses, alignment is
1477 irrelevant for them. */
1478 if (STMT_VINFO_STRIDED_P (stmt_info)
1479 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1480 continue;
1482 save_misalignment = DR_MISALIGNMENT (dr_info);
1483 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1484 supportable_dr_alignment
1485 = vect_supportable_dr_alignment (dr_info, false);
1486 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1488 if (!supportable_dr_alignment)
1489 return false;
1492 return true;
1495 /* Function vect_enhance_data_refs_alignment
1497 This pass will use loop versioning and loop peeling in order to enhance
1498 the alignment of data references in the loop.
1500 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1501 original loop is to be vectorized. Any other loops that are created by
1502 the transformations performed in this pass - are not supposed to be
1503 vectorized. This restriction will be relaxed.
1505 This pass will require a cost model to guide it whether to apply peeling
1506 or versioning or a combination of the two. For example, the scheme that
1507 Intel uses when given a loop with several memory accesses is as follows:
1508 choose one memory access ('p') whose alignment you want to force by doing
1509 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1510 other accesses are not necessarily aligned, or (2) use loop versioning to
1511 generate one loop in which all accesses are aligned, and another loop in
1512 which only 'p' is necessarily aligned.
1514 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1515 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1516 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1518 Devising a cost model is the most critical aspect of this work. It will
1519 guide us on which access to peel for, whether to use loop versioning, how
1520 many versions to create, etc. The cost model will probably consist of
1521 generic considerations as well as target specific considerations (on
1522 powerpc for example, misaligned stores are more painful than misaligned
1523 loads).
1525 Here are the general steps involved in alignment enhancements:
1527 -- original loop, before alignment analysis:
1528 for (i=0; i<N; i++){
1529 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1530 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1533 -- After vect_compute_data_refs_alignment:
1534 for (i=0; i<N; i++){
1535 x = q[i]; # DR_MISALIGNMENT(q) = 3
1536 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1539 -- Possibility 1: we do loop versioning:
1540 if (p is aligned) {
1541 for (i=0; i<N; i++){ # loop 1A
1542 x = q[i]; # DR_MISALIGNMENT(q) = 3
1543 p[i] = y; # DR_MISALIGNMENT(p) = 0
1546 else {
1547 for (i=0; i<N; i++){ # loop 1B
1548 x = q[i]; # DR_MISALIGNMENT(q) = 3
1549 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1553 -- Possibility 2: we do loop peeling:
1554 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1555 x = q[i];
1556 p[i] = y;
1558 for (i = 3; i < N; i++){ # loop 2A
1559 x = q[i]; # DR_MISALIGNMENT(q) = 0
1560 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1563 -- Possibility 3: combination of loop peeling and versioning:
1564 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1565 x = q[i];
1566 p[i] = y;
1568 if (p is aligned) {
1569 for (i = 3; i<N; i++){ # loop 3A
1570 x = q[i]; # DR_MISALIGNMENT(q) = 0
1571 p[i] = y; # DR_MISALIGNMENT(p) = 0
1574 else {
1575 for (i = 3; i<N; i++){ # loop 3B
1576 x = q[i]; # DR_MISALIGNMENT(q) = 0
1577 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1581 These loops are later passed to loop_transform to be vectorized. The
1582 vectorizer will use the alignment information to guide the transformation
1583 (whether to generate regular loads/stores, or with special handling for
1584 misalignment). */
1586 opt_result
1587 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1589 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1590 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1591 enum dr_alignment_support supportable_dr_alignment;
1592 dr_vec_info *first_store = NULL;
1593 dr_vec_info *dr0_info = NULL;
1594 struct data_reference *dr;
1595 unsigned int i, j;
1596 bool do_peeling = false;
1597 bool do_versioning = false;
1598 unsigned int npeel = 0;
1599 bool one_misalignment_known = false;
1600 bool one_misalignment_unknown = false;
1601 bool one_dr_unsupportable = false;
1602 dr_vec_info *unsupportable_dr_info = NULL;
1603 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1604 unsigned possible_npeel_number = 1;
1605 tree vectype;
1606 unsigned int mis, same_align_drs_max = 0;
1607 hash_table<peel_info_hasher> peeling_htab (1);
1609 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1611 /* Reset data so we can safely be called multiple times. */
1612 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1613 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1615 /* While cost model enhancements are expected in the future, the high level
1616 view of the code at this time is as follows:
1618 A) If there is a misaligned access then see if peeling to align
1619 this access can make all data references satisfy
1620 vect_supportable_dr_alignment. If so, update data structures
1621 as needed and return true.
1623 B) If peeling wasn't possible and there is a data reference with an
1624 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1625 then see if loop versioning checks can be used to make all data
1626 references satisfy vect_supportable_dr_alignment. If so, update
1627 data structures as needed and return true.
1629 C) If neither peeling nor versioning were successful then return false if
1630 any data reference does not satisfy vect_supportable_dr_alignment.
1632 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1634 Note, Possibility 3 above (which is peeling and versioning together) is not
1635 being done at this time. */
1637 /* (1) Peeling to force alignment. */
1639 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1640 Considerations:
1641 + How many accesses will become aligned due to the peeling
1642 - How many accesses will become unaligned due to the peeling,
1643 and the cost of misaligned accesses.
1644 - The cost of peeling (the extra runtime checks, the increase
1645 in code size). */
1647 FOR_EACH_VEC_ELT (datarefs, i, dr)
1649 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1650 stmt_vec_info stmt_info = dr_info->stmt;
1652 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1653 continue;
1655 /* For interleaving, only the alignment of the first access
1656 matters. */
1657 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1658 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1659 continue;
1661 /* For scatter-gather or invariant accesses there is nothing
1662 to enhance. */
1663 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1664 || integer_zerop (DR_STEP (dr)))
1665 continue;
1667 /* Strided accesses perform only component accesses, alignment is
1668 irrelevant for them. */
1669 if (STMT_VINFO_STRIDED_P (stmt_info)
1670 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1671 continue;
1673 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1674 do_peeling = vector_alignment_reachable_p (dr_info);
1675 if (do_peeling)
1677 if (known_alignment_for_access_p (dr_info))
1679 unsigned int npeel_tmp = 0;
1680 bool negative = tree_int_cst_compare (DR_STEP (dr),
1681 size_zero_node) < 0;
1683 vectype = STMT_VINFO_VECTYPE (stmt_info);
1684 /* If known_alignment_for_access_p then we have set
1685 DR_MISALIGNMENT, which is only done if we know it at compile
1686 time, so it is safe to assume the target alignment is constant. */
1688 unsigned int target_align =
1689 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1690 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1691 mis = (negative
1692 ? DR_MISALIGNMENT (dr_info)
1693 : -DR_MISALIGNMENT (dr_info));
1694 if (DR_MISALIGNMENT (dr_info) != 0)
1695 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1697 /* For multiple types, it is possible that the bigger type access
1698 will have more than one peeling option. E.g., a loop with two
1699 types: one of size (vector size / 4), and the other one of
1700 size (vector size / 8). The vectorization factor will be 8. If both
1701 accesses are misaligned by 3, the first one needs one scalar
1702 iteration to be aligned, and the second one needs 5. But the
1703 first one will be aligned also by peeling 5 scalar
1704 iterations, and in that case both accesses will be aligned.
1705 Hence, except for the immediate peeling amount, we also want
1706 to try to add full vector size, while we don't exceed
1707 vectorization factor.
1708 We do this automatically for cost model, since we calculate
1709 cost for every peeling option. */
1710 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1712 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1713 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1714 possible_npeel_number
1715 = vect_get_num_vectors (nscalars, vectype);
1717 /* NPEEL_TMP is 0 when there is no misalignment, but also
1718 allow peeling NELEMENTS. */
1719 if (DR_MISALIGNMENT (dr_info) == 0)
1720 possible_npeel_number++;
1723 /* Save info about DR in the hash table. Also include peeling
1724 amounts according to the explanation above. */
1725 for (j = 0; j < possible_npeel_number; j++)
1727 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1728 dr_info, npeel_tmp);
1729 npeel_tmp += target_align / dr_size;
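/* Continuing the illustration above (target_align 16, dr_size 4,
   initial npeel_tmp 2): the candidate peeling amounts recorded are
   2, 6, 10, ..., one per iteration, POSSIBLE_NPEEL_NUMBER in total.  */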
1732 one_misalignment_known = true;
1734 else
1736 /* If we don't know any misalignment values, we prefer
1737 peeling for the data-ref that has the maximum number of data-refs
1738 with the same alignment, unless the target prefers to align
1739 stores over loads. */
1740 unsigned same_align_drs
1741 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1742 if (!dr0_info
1743 || same_align_drs_max < same_align_drs)
1745 same_align_drs_max = same_align_drs;
1746 dr0_info = dr_info;
1748 /* For data-refs with the same number of related
1749 accesses prefer the one where the misalign
1750 computation will be invariant in the outermost loop. */
1751 else if (same_align_drs_max == same_align_drs)
1753 struct loop *ivloop0, *ivloop;
1754 ivloop0 = outermost_invariant_loop_for_expr
1755 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1756 ivloop = outermost_invariant_loop_for_expr
1757 (loop, DR_BASE_ADDRESS (dr));
1758 if ((ivloop && !ivloop0)
1759 || (ivloop && ivloop0
1760 && flow_loop_nested_p (ivloop, ivloop0)))
1761 dr0_info = dr_info;
1764 one_misalignment_unknown = true;
1766 /* Check for data refs with unsupportable alignment that
1767 can be peeled. */
1768 if (!supportable_dr_alignment)
1770 one_dr_unsupportable = true;
1771 unsupportable_dr_info = dr_info;
1774 if (!first_store && DR_IS_WRITE (dr))
1775 first_store = dr_info;
1778 else
1780 if (!aligned_access_p (dr_info))
1782 if (dump_enabled_p ())
1783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1784 "vector alignment may not be reachable\n");
1785 break;
1790 /* Check if we can possibly peel the loop. */
1791 if (!vect_can_advance_ivs_p (loop_vinfo)
1792 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1793 || loop->inner)
1794 do_peeling = false;
1796 struct _vect_peel_extended_info peel_for_known_alignment;
1797 struct _vect_peel_extended_info peel_for_unknown_alignment;
1798 struct _vect_peel_extended_info best_peel;
1800 peel_for_unknown_alignment.inside_cost = INT_MAX;
1801 peel_for_unknown_alignment.outside_cost = INT_MAX;
1802 peel_for_unknown_alignment.peel_info.count = 0;
1804 if (do_peeling
1805 && one_misalignment_unknown)
1807 /* Check whether the target prefers to align stores over loads, i.e.,
1808 whether misaligned stores are more expensive than misaligned loads
1809 (taking drs with the same alignment into account). */
1810 unsigned int load_inside_cost = 0;
1811 unsigned int load_outside_cost = 0;
1812 unsigned int store_inside_cost = 0;
1813 unsigned int store_outside_cost = 0;
1814 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1816 stmt_vector_for_cost dummy;
1817 dummy.create (2);
1818 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1819 &load_inside_cost,
1820 &load_outside_cost,
1821 &dummy, &dummy, estimated_npeels, true);
1822 dummy.release ();
1824 if (first_store)
1826 dummy.create (2);
1827 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1828 &store_inside_cost,
1829 &store_outside_cost,
1830 &dummy, &dummy,
1831 estimated_npeels, true);
1832 dummy.release ();
1834 else
1836 store_inside_cost = INT_MAX;
1837 store_outside_cost = INT_MAX;
1840 if (load_inside_cost > store_inside_cost
1841 || (load_inside_cost == store_inside_cost
1842 && load_outside_cost > store_outside_cost))
1844 dr0_info = first_store;
1845 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1846 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1848 else
1850 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1851 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1854 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1855 prologue_cost_vec.create (2);
1856 epilogue_cost_vec.create (2);
1858 int dummy2;
1859 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1860 (loop_vinfo, estimated_npeels, &dummy2,
1861 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1862 &prologue_cost_vec, &epilogue_cost_vec);
1864 prologue_cost_vec.release ();
1865 epilogue_cost_vec.release ();
1867 peel_for_unknown_alignment.peel_info.count = 1
1868 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1871 peel_for_unknown_alignment.peel_info.npeel = 0;
1872 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1874 best_peel = peel_for_unknown_alignment;
1876 peel_for_known_alignment.inside_cost = INT_MAX;
1877 peel_for_known_alignment.outside_cost = INT_MAX;
1878 peel_for_known_alignment.peel_info.count = 0;
1879 peel_for_known_alignment.peel_info.dr_info = NULL;
1881 if (do_peeling && one_misalignment_known)
1883 /* Peeling is possible, but there is no data access that is not supported
1884 unless aligned. So we try to choose the best possible peeling from
1885 the hash table. */
1886 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1887 (&peeling_htab, loop_vinfo);
1890 /* Compare costs of peeling for known and unknown alignment. */
1891 if (peel_for_known_alignment.peel_info.dr_info != NULL
1892 && peel_for_unknown_alignment.inside_cost
1893 >= peel_for_known_alignment.inside_cost)
1895 best_peel = peel_for_known_alignment;
1897 /* If the best peeling for known alignment has NPEEL == 0, perform no
1898 peeling at all unless there is an unsupportable dr that we can
1899 align. */
1900 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1901 do_peeling = false;
1904 /* If there is an unsupportable data ref, prefer this over all choices so far
1905 since we'd otherwise have to discard a chosen peeling unless it happened to
1906 align the unsupportable data ref. */
1907 if (one_dr_unsupportable)
1908 dr0_info = unsupportable_dr_info;
1909 else if (do_peeling)
1911 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1912 TODO: Use nopeel_outside_cost or get rid of it? */
1913 unsigned nopeel_inside_cost = 0;
1914 unsigned nopeel_outside_cost = 0;
1916 stmt_vector_for_cost dummy;
1917 dummy.create (2);
1918 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
1919 &nopeel_outside_cost, &dummy, &dummy,
1920 0, false);
1921 dummy.release ();
1923 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1924 costs will be recorded. */
1925 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1926 prologue_cost_vec.create (2);
1927 epilogue_cost_vec.create (2);
1929 int dummy2;
1930 nopeel_outside_cost += vect_get_known_peeling_cost
1931 (loop_vinfo, 0, &dummy2,
1932 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1933 &prologue_cost_vec, &epilogue_cost_vec);
1935 prologue_cost_vec.release ();
1936 epilogue_cost_vec.release ();
1938 npeel = best_peel.peel_info.npeel;
1939 dr0_info = best_peel.peel_info.dr_info;
1941 /* If not peeling at all is no more expensive than the best peeling we
1942 have found so far, don't perform any peeling. */
1943 if (nopeel_inside_cost <= best_peel.inside_cost)
1944 do_peeling = false;
1947 if (do_peeling)
1949 stmt_vec_info stmt_info = dr0_info->stmt;
1950 vectype = STMT_VINFO_VECTYPE (stmt_info);
1952 if (known_alignment_for_access_p (dr0_info))
1954 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
1955 size_zero_node) < 0;
1956 if (!npeel)
1958 /* Since it's known at compile time, compute the number of
1959 iterations in the peeled loop (the peeling factor) for use in
1960 updating DR_MISALIGNMENT values. The peeling factor is the
1961 vectorization factor minus the misalignment as an element
1962 count. */
1963 mis = (negative
1964 ? DR_MISALIGNMENT (dr0_info)
1965 : -DR_MISALIGNMENT (dr0_info));
1966 /* If known_alignment_for_access_p then we have set
1967 DR_MISALIGNMENT, which is only done if we know it at compile
1968 time, so it is safe to assume the target alignment is constant. */
1970 unsigned int target_align =
1971 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
1972 npeel = ((mis & (target_align - 1))
1973 / vect_get_scalar_dr_size (dr0_info));
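/* Made-up numbers for illustration: 8-byte elements, a 32-byte target
   alignment (so VF 4) and DR_MISALIGNMENT 16 with a positive step give
   MIS = -16 and npeel = (-16 & 31) / 8 = 2, i.e. VF minus a
   misalignment of two elements.  */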
1976 /* For interleaved data access every iteration accesses all the
1977 members of the group, therefore we divide the number of iterations
1978 by the group size. */
1979 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1980 npeel /= DR_GROUP_SIZE (stmt_info);
1982 if (dump_enabled_p ())
1983 dump_printf_loc (MSG_NOTE, vect_location,
1984 "Try peeling by %d\n", npeel);
1987 /* Ensure that all datarefs can be vectorized after the peel. */
1988 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
1989 do_peeling = false;
1991 /* Check if all datarefs are supportable and log. */
1992 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
1994 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
1995 if (!stat)
1996 do_peeling = false;
1997 else
1998 return stat;
2001 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2002 if (do_peeling)
2004 unsigned max_allowed_peel
2005 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2006 if (max_allowed_peel != (unsigned)-1)
2008 unsigned max_peel = npeel;
2009 if (max_peel == 0)
2011 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2012 unsigned HOST_WIDE_INT target_align_c;
2013 if (target_align.is_constant (&target_align_c))
2014 max_peel =
2015 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
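/* E.g. (illustrative): with a 16-byte target alignment and 4-byte
   elements, at most 16 / 4 - 1 = 3 peeled iterations can ever be
   needed, so that is the worst case checked against the param.  */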
2016 else
2018 do_peeling = false;
2019 if (dump_enabled_p ())
2020 dump_printf_loc (MSG_NOTE, vect_location,
2021 "Disable peeling, max peels set and vector"
2022 " alignment unknown\n");
2025 if (max_peel > max_allowed_peel)
2027 do_peeling = false;
2028 if (dump_enabled_p ())
2029 dump_printf_loc (MSG_NOTE, vect_location,
2030 "Disable peeling, max peels reached: %d\n", max_peel);
2035 /* Cost model #2 - if peeling may result in a remaining loop not
2036 iterating enough to be vectorized then do not peel. Since this
2037 is a cost heuristic rather than a correctness decision, use the
2038 most likely runtime value for variable vectorization factors. */
2039 if (do_peeling
2040 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2042 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2043 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2044 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2045 < assumed_vf + max_peel)
2046 do_peeling = false;
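/* E.g. (illustrative): assumed_vf 4 and npeel 0 give max_peel 3; with
   only 6 known scalar iterations, 6 < 4 + 3 and peeling is rejected
   because too little of the loop would remain to vectorize.  */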
2049 if (do_peeling)
2051 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2052 If the misalignment of DR_i is identical to that of dr0 then set
2053 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2054 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2055 by the peeling factor times the element size of DR_i (MOD the
2056 vectorization factor times the size). Otherwise, the
2057 misalignment of DR_i must be set to unknown. */
2058 FOR_EACH_VEC_ELT (datarefs, i, dr)
2059 if (dr != dr0_info->dr)
2061 /* Strided accesses perform only component accesses, alignment
2062 is irrelevant for them. */
2063 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2064 stmt_info = dr_info->stmt;
2065 if (STMT_VINFO_STRIDED_P (stmt_info)
2066 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2067 continue;
2069 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2072 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2073 if (npeel)
2074 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2075 else
2076 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2077 = DR_MISALIGNMENT (dr0_info);
2078 SET_DR_MISALIGNMENT (dr0_info, 0);
2079 if (dump_enabled_p ())
2081 dump_printf_loc (MSG_NOTE, vect_location,
2082 "Alignment of access forced using peeling.\n");
2083 dump_printf_loc (MSG_NOTE, vect_location,
2084 "Peeling for alignment will be applied.\n");
2087 /* The inside-loop cost will be accounted for in vectorizable_load
2088 and vectorizable_store correctly with adjusted alignments.
2089 Drop the body_cst_vec on the floor here. */
2090 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2091 gcc_assert (stat);
2092 return stat;
2096 /* (2) Versioning to force alignment. */
2098 /* Try versioning if:
2099 1) optimize loop for speed
2100 2) there is at least one unsupported misaligned data ref with an unknown
2101 misalignment, and
2102 3) all misaligned data refs with a known misalignment are supported, and
2103 4) the number of runtime alignment checks is within reason. */
2105 do_versioning =
2106 optimize_loop_nest_for_speed_p (loop)
2107 && (!loop->inner); /* FORNOW */
2109 if (do_versioning)
2111 FOR_EACH_VEC_ELT (datarefs, i, dr)
2113 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2114 stmt_vec_info stmt_info = dr_info->stmt;
2116 /* For interleaving, only the alignment of the first access
2117 matters. */
2118 if (aligned_access_p (dr_info)
2119 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2120 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2121 continue;
2123 if (STMT_VINFO_STRIDED_P (stmt_info))
2125 /* Strided loads perform only component accesses, alignment is
2126 irrelevant for them. */
2127 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2128 continue;
2129 do_versioning = false;
2130 break;
2133 supportable_dr_alignment
2134 = vect_supportable_dr_alignment (dr_info, false);
2136 if (!supportable_dr_alignment)
2138 int mask;
2139 tree vectype;
2141 if (known_alignment_for_access_p (dr_info)
2142 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2143 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2145 do_versioning = false;
2146 break;
2149 vectype = STMT_VINFO_VECTYPE (stmt_info);
2150 gcc_assert (vectype);
2152 /* At present we don't support versioning for alignment
2153 with variable VF, since there's no guarantee that the
2154 VF is a power of two. We could relax this if we added
2155 a way of enforcing a power-of-two size. */
2156 unsigned HOST_WIDE_INT size;
2157 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2159 do_versioning = false;
2160 break;
2163 /* The rightmost bits of an aligned address must be zeros.
2164 Construct the mask needed for this test. For example,
2165 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2166 mask must be 15 = 0xf. */
2167 mask = size - 1;
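/* The guard of the versioned loop then checks that the recorded
   addresses have these low bits clear, conceptually
   ((addr_1 | addr_2 | ...) & mask) == 0; see
   vect_create_cond_for_align_checks (sketch of the idea only).  */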
2169 /* FORNOW: use the same mask to test all potentially unaligned
2170 references in the loop. The vectorizer currently supports
2171 a single vector size, see the reference to
2172 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2173 vectorization factor is computed. */
2174 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2175 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2176 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2177 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2181 /* Versioning requires at least one misaligned data reference. */
2182 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2183 do_versioning = false;
2184 else if (!do_versioning)
2185 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2188 if (do_versioning)
2190 vec<stmt_vec_info> may_misalign_stmts
2191 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2192 stmt_vec_info stmt_info;
2194 /* It can now be assumed that the data references in the statements
2195 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2196 of the loop being vectorized. */
2197 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2199 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2200 SET_DR_MISALIGNMENT (dr_info, 0);
2201 if (dump_enabled_p ())
2202 dump_printf_loc (MSG_NOTE, vect_location,
2203 "Alignment of access forced using versioning.\n");
2206 if (dump_enabled_p ())
2207 dump_printf_loc (MSG_NOTE, vect_location,
2208 "Versioning for alignment will be applied.\n");
2210 /* Peeling and versioning can't be done together at this time. */
2211 gcc_assert (! (do_peeling && do_versioning));
2213 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2214 gcc_assert (stat);
2215 return stat;
2218 /* This point is reached if neither peeling nor versioning is being done. */
2219 gcc_assert (! (do_peeling || do_versioning));
2221 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2222 return stat;
2226 /* Function vect_find_same_alignment_drs.
2228 Update group and alignment relations in VINFO according to the chosen
2229 vectorization factor. */
2231 static void
2232 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2234 struct data_reference *dra = DDR_A (ddr);
2235 struct data_reference *drb = DDR_B (ddr);
2236 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2237 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2238 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2239 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2241 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2242 return;
2244 if (dra == drb)
2245 return;
2247 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2248 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2249 return;
2251 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2252 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2253 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2254 return;
2256 /* Two references with distance zero have the same alignment. */
2257 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2258 - wi::to_poly_offset (DR_INIT (drb)));
2259 if (maybe_ne (diff, 0))
2261 /* Get the wider of the two alignments. */
2262 poly_uint64 align_a =
2263 exact_div (vect_calculate_target_alignment (dr_info_a),
2264 BITS_PER_UNIT);
2265 poly_uint64 align_b =
2266 exact_div (vect_calculate_target_alignment (dr_info_b),
2267 BITS_PER_UNIT);
2268 unsigned HOST_WIDE_INT align_a_c, align_b_c;
2269 if (!align_a.is_constant (&align_a_c)
2270 || !align_b.is_constant (&align_b_c))
2271 return;
2273 unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
2275 /* Require the gap to be a multiple of the larger vector alignment. */
2276 if (!multiple_p (diff, max_align))
2277 return;
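/* E.g. (illustrative): with 4-byte elements and a 16-byte vector
   alignment, a gap of 16 bytes (a[i] versus a[i+4]) passes this test,
   while a gap of 4 bytes (a[i] versus a[i+1]) does not.  */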
2280 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2281 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2282 if (dump_enabled_p ())
2283 dump_printf_loc (MSG_NOTE, vect_location,
2284 "accesses have the same alignment: %T and %T\n",
2285 DR_REF (dra), DR_REF (drb));
2289 /* Function vect_analyze_data_refs_alignment
2291 Analyze the alignment of the data-references in the loop.
2292 Return FALSE if a data reference is found that cannot be vectorized. */
2294 opt_result
2295 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2297 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2299 /* Mark groups of data references with same alignment using
2300 data dependence information. */
2301 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2302 struct data_dependence_relation *ddr;
2303 unsigned int i;
2305 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2306 vect_find_same_alignment_drs (vinfo, ddr);
2308 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2309 struct data_reference *dr;
2311 vect_record_base_alignments (vinfo);
2312 FOR_EACH_VEC_ELT (datarefs, i, dr)
2314 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2315 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2316 vect_compute_data_ref_alignment (dr_info);
2319 return opt_result::success ();
2323 /* Analyze alignment of DRs of stmts in NODE. */
2325 static bool
2326 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2328 /* We vectorize from the first scalar stmt in the node unless
2329 the node is permuted, in which case we start from the first
2330 element in the group. */
2331 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2332 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2333 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2334 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2336 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2337 vect_compute_data_ref_alignment (dr_info);
2338 /* For creating the data-ref pointer we need alignment of the
2339 first element anyway. */
2340 if (dr_info != first_dr_info)
2341 vect_compute_data_ref_alignment (first_dr_info);
2342 if (! verify_data_ref_alignment (dr_info))
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346 "not vectorized: bad data alignment in basic "
2347 "block.\n");
2348 return false;
2351 return true;
2354 /* Function vect_slp_analyze_and_verify_instance_alignment
2356 Analyze the alignment of the data-references in the SLP instance.
2357 Return FALSE if a data reference is found that cannot be vectorized. */
2359 bool
2360 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2362 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2364 slp_tree node;
2365 unsigned i;
2366 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2367 if (! vect_slp_analyze_and_verify_node_alignment (node))
2368 return false;
2370 node = SLP_INSTANCE_TREE (instance);
2371 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2372 && ! vect_slp_analyze_and_verify_node_alignment
2373 (SLP_INSTANCE_TREE (instance)))
2374 return false;
2376 return true;
2380 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2381 accesses of legal size, step, etc. Detect gaps, single element
2382 interleaving, and other special cases. Set grouped access info.
2383 Collect groups of strided stores for further use in SLP analysis.
2384 Worker for vect_analyze_group_access. */
2386 static bool
2387 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2389 data_reference *dr = dr_info->dr;
2390 tree step = DR_STEP (dr);
2391 tree scalar_type = TREE_TYPE (DR_REF (dr));
2392 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2393 stmt_vec_info stmt_info = dr_info->stmt;
2394 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2395 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2396 HOST_WIDE_INT dr_step = -1;
2397 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2398 bool slp_impossible = false;
2400 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2401 size of the interleaving group (including gaps). */
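/* For example (illustrative), 4-byte loads from a[3*i] and a[3*i+1]
   have DR_STEP 12, so GROUPSIZE is 3 with a one-element gap at
   a[3*i+2].  */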
2402 if (tree_fits_shwi_p (step))
2404 dr_step = tree_to_shwi (step);
2405 /* Check that STEP is a multiple of type size. Otherwise there is
2406 a non-element-sized gap at the end of the group which we
2407 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2408 ??? As we can handle non-constant step fine here we should
2409 simply remove uses of DR_GROUP_GAP between the last and first
2410 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2411 simply not include that gap. */
2412 if ((dr_step % type_size) != 0)
2414 if (dump_enabled_p ())
2415 dump_printf_loc (MSG_NOTE, vect_location,
2416 "Step %T is not a multiple of the element size"
2417 " for %T\n",
2418 step, DR_REF (dr));
2419 return false;
2421 groupsize = absu_hwi (dr_step) / type_size;
2423 else
2424 groupsize = 0;
2426 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2427 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2429 /* Check whether this DR is part of an interleaving group and is the
2430 single element of the group that is accessed in the loop. */
2432 /* Gaps are supported only for loads. STEP must be a multiple of the type
2433 size. */
2434 if (DR_IS_READ (dr)
2435 && (dr_step % type_size) == 0
2436 && groupsize > 0)
2438 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2439 DR_GROUP_SIZE (stmt_info) = groupsize;
2440 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2441 if (dump_enabled_p ())
2442 dump_printf_loc (MSG_NOTE, vect_location,
2443 "Detected single element interleaving %T"
2444 " step %T\n",
2445 DR_REF (dr), step);
2447 return true;
2450 if (dump_enabled_p ())
2451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2452 "not consecutive access %G", stmt_info->stmt);
2454 if (bb_vinfo)
2456 /* Mark the statement as unvectorizable. */
2457 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2458 return true;
2461 if (dump_enabled_p ())
2462 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2463 STMT_VINFO_STRIDED_P (stmt_info) = true;
2464 return true;
2467 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2469 /* First stmt in the interleaving chain. Check the chain. */
2470 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2471 struct data_reference *data_ref = dr;
2472 unsigned int count = 1;
2473 tree prev_init = DR_INIT (data_ref);
2474 stmt_vec_info prev = stmt_info;
2475 HOST_WIDE_INT diff, gaps = 0;
2477 /* By construction, all group members have INTEGER_CST DR_INITs. */
2478 while (next)
2480 /* Skip same data-refs. If two or more stmts share the same
2481 data-ref (supported only for loads), we vectorize only the first
2482 stmt, and the rest get their vectorized loads from the first
2483 one. */
2484 if (!tree_int_cst_compare (DR_INIT (data_ref),
2485 DR_INIT (STMT_VINFO_DATA_REF (next))))
2487 if (DR_IS_WRITE (data_ref))
2489 if (dump_enabled_p ())
2490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2491 "Two store stmts share the same dr.\n");
2492 return false;
2495 if (dump_enabled_p ())
2496 dump_printf_loc (MSG_NOTE, vect_location,
2497 "Two or more load stmts share the same dr.\n");
2499 /* For loads, use the same data-ref load. */
2500 DR_GROUP_SAME_DR_STMT (next) = prev;
2502 prev = next;
2503 next = DR_GROUP_NEXT_ELEMENT (next);
2504 continue;
2507 prev = next;
2508 data_ref = STMT_VINFO_DATA_REF (next);
2510 /* All group members have the same STEP by construction. */
2511 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2513 /* Check that the distance between two accesses is equal to the type
2514 size. Otherwise, we have gaps. */
2515 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2516 - TREE_INT_CST_LOW (prev_init)) / type_size;
2517 if (diff != 1)
2519 /* FORNOW: SLP of accesses with gaps is not supported. */
2520 slp_impossible = true;
2521 if (DR_IS_WRITE (data_ref))
2523 if (dump_enabled_p ())
2524 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2525 "interleaved store with gaps\n");
2526 return false;
2529 gaps += diff - 1;
2532 last_accessed_element += diff;
2534 /* Store the gap from the previous member of the group. If there is no
2535 gap in the access, DR_GROUP_GAP is always 1. */
2536 DR_GROUP_GAP (next) = diff;
2538 prev_init = DR_INIT (data_ref);
2539 next = DR_GROUP_NEXT_ELEMENT (next);
2540 /* Count the number of data-refs in the chain. */
2541 count++;
2544 if (groupsize == 0)
2545 groupsize = count + gaps;
2547 /* This could be UINT_MAX, but since we generate code for such groups in a
2548 very inefficient way we have to cap it earlier. See PR78699 for an example. */
2549 if (groupsize > 4096)
2551 if (dump_enabled_p ())
2552 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2553 "group is too large\n");
2554 return false;
2557 /* Check that the size of the interleaving is equal to count for stores,
2558 i.e., that there are no gaps. */
2559 if (groupsize != count
2560 && !DR_IS_READ (dr))
2562 groupsize = count;
2563 STMT_VINFO_STRIDED_P (stmt_info) = true;
2566 /* If there is a gap after the last load in the group it is the
2567 difference between the groupsize and the last accessed
2568 element.
2569 When there is no gap, this difference should be 0. */
2570 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2572 DR_GROUP_SIZE (stmt_info) = groupsize;
2573 if (dump_enabled_p ())
2575 dump_printf_loc (MSG_NOTE, vect_location,
2576 "Detected interleaving ");
2577 if (DR_IS_READ (dr))
2578 dump_printf (MSG_NOTE, "load ");
2579 else if (STMT_VINFO_STRIDED_P (stmt_info))
2580 dump_printf (MSG_NOTE, "strided store ");
2581 else
2582 dump_printf (MSG_NOTE, "store ");
2583 dump_printf (MSG_NOTE, "of size %u\n",
2584 (unsigned)groupsize);
2585 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2586 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2587 while (next)
2589 if (DR_GROUP_GAP (next) != 1)
2590 dump_printf_loc (MSG_NOTE, vect_location,
2591 "\t<gap of %d elements>\n",
2592 DR_GROUP_GAP (next) - 1);
2593 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2594 next = DR_GROUP_NEXT_ELEMENT (next);
2596 if (DR_GROUP_GAP (stmt_info) != 0)
2597 dump_printf_loc (MSG_NOTE, vect_location,
2598 "\t<gap of %d elements>\n",
2599 DR_GROUP_GAP (stmt_info));
2602 /* SLP: create an SLP data structure for every interleaving group of
2603 stores for further analysis in vect_analyze_slp. */
2604 if (DR_IS_WRITE (dr) && !slp_impossible)
2606 if (loop_vinfo)
2607 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2608 if (bb_vinfo)
2609 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2613 return true;
2616 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2617 accesses of legal size, step, etc. Detect gaps, single element
2618 interleaving, and other special cases. Set grouped access info.
2619 Collect groups of strided stores for further use in SLP analysis. */
2621 static bool
2622 vect_analyze_group_access (dr_vec_info *dr_info)
2624 if (!vect_analyze_group_access_1 (dr_info))
2626 /* Dissolve the group if present. */
2627 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2628 while (stmt_info)
2630 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2631 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2632 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2633 stmt_info = next;
2635 return false;
2637 return true;
2640 /* Analyze the access pattern of the data-reference DR_INFO.
2641 In case of non-consecutive accesses call vect_analyze_group_access() to
2642 analyze groups of accesses. */
2644 static bool
2645 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2647 data_reference *dr = dr_info->dr;
2648 tree step = DR_STEP (dr);
2649 tree scalar_type = TREE_TYPE (DR_REF (dr));
2650 stmt_vec_info stmt_info = dr_info->stmt;
2651 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2652 struct loop *loop = NULL;
2654 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2655 return true;
2657 if (loop_vinfo)
2658 loop = LOOP_VINFO_LOOP (loop_vinfo);
2660 if (loop_vinfo && !step)
2662 if (dump_enabled_p ())
2663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2664 "bad data-ref access in loop\n");
2665 return false;
2668 /* Allow loads with zero step in inner-loop vectorization. */
2669 if (loop_vinfo && integer_zerop (step))
2671 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2672 if (!nested_in_vect_loop_p (loop, stmt_info))
2673 return DR_IS_READ (dr);
2674 /* Allow references with zero step for outer loops marked
2675 with pragma omp simd only - it guarantees absence of
2676 loop-carried dependencies between inner loop iterations. */
2677 if (loop->safelen < 2)
2679 if (dump_enabled_p ())
2680 dump_printf_loc (MSG_NOTE, vect_location,
2681 "zero step in inner loop of nest\n");
2682 return false;
2686 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2688 /* Interleaved accesses are not yet supported within outer-loop
2689 vectorization for references in the inner-loop. */
2690 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2692 /* For the rest of the analysis we use the outer-loop step. */
2693 step = STMT_VINFO_DR_STEP (stmt_info);
2694 if (integer_zerop (step))
2696 if (dump_enabled_p ())
2697 dump_printf_loc (MSG_NOTE, vect_location,
2698 "zero step in outer loop.\n");
2699 return DR_IS_READ (dr);
2703 /* Consecutive? */
2704 if (TREE_CODE (step) == INTEGER_CST)
2706 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2707 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2708 || (dr_step < 0
2709 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2711 /* Mark that it is not interleaving. */
2712 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2713 return true;
2717 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2719 if (dump_enabled_p ())
2720 dump_printf_loc (MSG_NOTE, vect_location,
2721 "grouped access in outer loop.\n");
2722 return false;
2726 /* Assume this is a DR handled by the non-constant strided load case. */
2727 if (TREE_CODE (step) != INTEGER_CST)
2728 return (STMT_VINFO_STRIDED_P (stmt_info)
2729 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2730 || vect_analyze_group_access (dr_info)));
2732 /* Not a consecutive access - check if it's part of an interleaving group. */
2733 return vect_analyze_group_access (dr_info);
2736 /* Compare two data-references DRA and DRB, ordering them so that
2737 references which may form a group end up next to each other. */
2739 static int
2740 dr_group_sort_cmp (const void *dra_, const void *drb_)
2742 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2743 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2744 int cmp;
2746 /* Stabilize sort. */
2747 if (dra == drb)
2748 return 0;
2750 /* DRs in different loops never belong to the same group. */
2751 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2752 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2753 if (loopa != loopb)
2754 return loopa->num < loopb->num ? -1 : 1;
2756 /* Ordering of DRs according to base. */
2757 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2758 DR_BASE_ADDRESS (drb));
2759 if (cmp != 0)
2760 return cmp;
2762 /* And according to DR_OFFSET. */
2763 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2764 if (cmp != 0)
2765 return cmp;
2767 /* Put reads before writes. */
2768 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2769 return DR_IS_READ (dra) ? -1 : 1;
2771 /* Then sort by access size. */
2772 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2773 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2774 if (cmp != 0)
2775 return cmp;
2777 /* And then by step. */
2778 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2779 if (cmp != 0)
2780 return cmp;
2782 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2783 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2784 if (cmp == 0)
2785 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2786 return cmp;
2789 /* If OP is the result of a conversion, return the unconverted value,
2790 otherwise return null. */
2792 static tree
2793 strip_conversion (tree op)
2795 if (TREE_CODE (op) != SSA_NAME)
2796 return NULL_TREE;
2797 gimple *stmt = SSA_NAME_DEF_STMT (op);
2798 if (!is_gimple_assign (stmt)
2799 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2800 return NULL_TREE;
2801 return gimple_assign_rhs1 (stmt);
2804 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2805 and STMT2_INFO being in a single group. */
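/* For example, two .MASK_LOAD calls whose mask arguments are the same
   SSA name (possibly behind a conversion) can form a group, while a
   .MASK_LOAD paired with a .MASK_STORE, or two calls with unrelated
   masks, cannot.  */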
2807 static bool
2808 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
2810 if (gimple_assign_single_p (stmt1_info->stmt))
2811 return gimple_assign_single_p (stmt2_info->stmt);
2813 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2814 if (call1 && gimple_call_internal_p (call1))
2816 /* Check for two masked loads or two masked stores. */
2817 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2818 if (!call2 || !gimple_call_internal_p (call2))
2819 return false;
2820 internal_fn ifn = gimple_call_internal_fn (call1);
2821 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2822 return false;
2823 if (ifn != gimple_call_internal_fn (call2))
2824 return false;
2826 /* Check that the masks are the same. Cope with casts of masks,
2827 like those created by build_mask_conversion. */
2828 tree mask1 = gimple_call_arg (call1, 2);
2829 tree mask2 = gimple_call_arg (call2, 2);
2830 if (!operand_equal_p (mask1, mask2, 0))
2832 mask1 = strip_conversion (mask1);
2833 if (!mask1)
2834 return false;
2835 mask2 = strip_conversion (mask2);
2836 if (!mask2)
2837 return false;
2838 if (!operand_equal_p (mask1, mask2, 0))
2839 return false;
2841 return true;
2844 return false;
2847 /* Function vect_analyze_data_ref_accesses.
2849 Analyze the access pattern of all the data references in the loop.
2851 FORNOW: the only access pattern that is considered vectorizable is a
2852 simple step 1 (consecutive) access.
2854 FORNOW: handle only arrays and pointer accesses. */
2856 opt_result
2857 vect_analyze_data_ref_accesses (vec_info *vinfo)
2859 unsigned int i;
2860 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2861 struct data_reference *dr;
2863 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2865 if (datarefs.is_empty ())
2866 return opt_result::success ();
2868 /* Sort the array of datarefs to make building the interleaving chains
2869 linear. Don't modify the original vector's order, it is needed for
2870 determining what dependencies are reversed. */
2871 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2872 datarefs_copy.qsort (dr_group_sort_cmp);
2873 hash_set<stmt_vec_info> to_fixup;
2875 /* Build the interleaving chains. */
2876 for (i = 0; i < datarefs_copy.length () - 1;)
2878 data_reference_p dra = datarefs_copy[i];
2879 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2880 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2881 stmt_vec_info lastinfo = NULL;
2882 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2883 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2885 ++i;
2886 continue;
2888 for (i = i + 1; i < datarefs_copy.length (); ++i)
2890 data_reference_p drb = datarefs_copy[i];
2891 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2892 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2893 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2894 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2895 break;
2897 /* ??? Imperfect sorting (non-compatible types, non-modulo
2898 accesses, same accesses) can lead to a group being artificially
2899 split here as we don't just skip over those. If it really
2900 matters we can push those to a worklist and re-iterate
2901 over them. Then we can just skip ahead to the next DR here. */
2903 /* DRs in a different loop should not be put into the same
2904 interleaving group. */
2905 if (gimple_bb (DR_STMT (dra))->loop_father
2906 != gimple_bb (DR_STMT (drb))->loop_father)
2907 break;
2909 /* Check that the data-refs have the same first location (except init),
2910 that they are both either stores or loads (not one load and one store),
2911 and that their statement forms can be grouped (see can_group_stmts_p). */
2912 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2913 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2914 DR_BASE_ADDRESS (drb)) != 0
2915 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2916 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
2917 break;
2919 /* Check that the data-refs have the same constant size. */
2920 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2921 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2922 if (!tree_fits_uhwi_p (sza)
2923 || !tree_fits_uhwi_p (szb)
2924 || !tree_int_cst_equal (sza, szb))
2925 break;
2927 /* Check that the data-refs have the same step. */
2928 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2929 break;
2931 /* Check the types are compatible.
2932 ??? We don't distinguish this during sorting. */
2933 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2934 TREE_TYPE (DR_REF (drb))))
2935 break;
2937 /* Check that the DR_INITs are compile-time constants. */
2938 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2939 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2940 break;
2942 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2943 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2944 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2945 HOST_WIDE_INT init_prev
2946 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2947 gcc_assert (init_a <= init_b
2948 && init_a <= init_prev
2949 && init_prev <= init_b);
2951 /* Do not place the same access in the interleaving chain twice. */
2952 if (init_b == init_prev)
2954 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2955 < gimple_uid (DR_STMT (drb)));
2956 /* Simply link in duplicates and fix up the chain below. */
2958 else
2960 /* If init_b == init_a + the size of the type * k, we have an
2961 interleaving, and DRA is accessed before DRB. */
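/* E.g. (illustrative), 4-byte loads with DR_INITs 0, 4 and 12 still
   form one chain, the missing element at offset 8 becoming a gap;
   for stores the adjacency check below splits the chain instead.  */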
2962 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2963 if (type_size_a == 0
2964 || (init_b - init_a) % type_size_a != 0)
2965 break;
2967 /* If we have a store, the accesses are adjacent. This splits
2968 groups into chunks we support (we don't support vectorization
2969 of stores with gaps). */
2970 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2971 break;
2973 /* If the step is constant and non-zero, and no greater than the
2974 difference between the data-refs' inits, split the group here;
2975 this splits groups into suitable sizes. */
2976 if (tree_fits_shwi_p (DR_STEP (dra)))
2978 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2979 if (step != 0 && step <= (init_b - init_a))
2980 break;
2984 if (dump_enabled_p ())
2985 dump_printf_loc (MSG_NOTE, vect_location,
2986 DR_IS_READ (dra)
2987 ? "Detected interleaving load %T and %T\n"
2988 : "Detected interleaving store %T and %T\n",
2989 DR_REF (dra), DR_REF (drb));
2991 /* Link the found element into the group list. */
2992 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
2994 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
2995 lastinfo = stmtinfo_a;
2997 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
2998 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
2999 lastinfo = stmtinfo_b;
3001 if (init_b == init_prev
3002 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3003 && dump_enabled_p ())
3004 dump_printf_loc (MSG_NOTE, vect_location,
3005 "Queuing group with duplicate access for fixup\n");
3009 /* Fix up groups with duplicate entries by splitting them. */
3010 while (1)
3012 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3013 if (!(it != to_fixup.end ()))
3014 break;
3015 stmt_vec_info grp = *it;
3016 to_fixup.remove (grp);
3018 /* Find the earliest duplicate group member. */
3019 unsigned first_duplicate = -1u;
3020 stmt_vec_info next, g = grp;
3021 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3023 if ((DR_INIT (STMT_VINFO_DR_INFO (next)->dr)
3024 == DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3025 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3026 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3027 g = next;
3029 if (first_duplicate == -1U)
3030 continue;
3032 /* Then move all stmts after the first duplicate to a new group.
3033 Note this is a heuristic but one with the property that *it
3034 is fixed up completely. */
3035 g = grp;
3036 stmt_vec_info newgroup = NULL, ng = grp;
3037 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3039 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3041 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3042 if (!newgroup)
3043 newgroup = next;
3044 else
3045 DR_GROUP_NEXT_ELEMENT (ng) = next;
3046 ng = next;
3047 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3049 else
3050 g = DR_GROUP_NEXT_ELEMENT (g);
3052 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3054 /* Fixup the new group which still may contain duplicates. */
3055 to_fixup.add (newgroup);
3058 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3060 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3061 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3062 && !vect_analyze_data_ref_access (dr_info))
3064 if (dump_enabled_p ())
3065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3066 "not vectorized: complicated access pattern.\n");
3068 if (is_a <bb_vec_info> (vinfo))
3070 /* Mark the statement as not vectorizable. */
3071 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3072 continue;
3074 else
3076 datarefs_copy.release ();
3077 return opt_result::failure_at (dr_info->stmt->stmt,
3078 "not vectorized:"
3079 " complicated access pattern.\n");
3084 datarefs_copy.release ();
3085 return opt_result::success ();
3088 /* Function vect_vfa_segment_size.
3090 Input:
3091 DR_INFO: The data reference.
3092 LENGTH_FACTOR: segment length to consider.
3094 Return a value suitable for the dr_with_seg_len::seg_len field.
3095 This is the "distance travelled" by the pointer from the first
3096 iteration in the segment to the last. Note that it does not include
3097 the size of the access; in effect it only describes the first byte. */
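/* For instance (illustrative numbers), a data reference with DR_STEP 4
   and LENGTH_FACTOR 8 gets a segment length of 4 * (8 - 1) = 28 bytes.  */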
3099 static tree
3100 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3102 length_factor = size_binop (MINUS_EXPR,
3103 fold_convert (sizetype, length_factor),
3104 size_one_node);
3105 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3106 length_factor);
3109 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3110 gives the worst-case number of bytes covered by the segment. */
3112 static unsigned HOST_WIDE_INT
3113 vect_vfa_access_size (dr_vec_info *dr_info)
3115 stmt_vec_info stmt_vinfo = dr_info->stmt;
3116 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3117 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3118 unsigned HOST_WIDE_INT access_size = ref_size;
3119 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3121 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3122 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3124 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3125 && (vect_supportable_dr_alignment (dr_info, false)
3126 == dr_explicit_realign_optimized))
3128 /* We might access a full vector's worth. */
3129 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3130 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3132 return access_size;
3135 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3136 describes. */
3138 static unsigned int
3139 vect_vfa_align (dr_vec_info *dr_info)
3141 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3144 /* Function vect_compile_time_alias.
3146 Given data references A and B with equal base and offset, see whether
3147 the alias relation can be decided at compilation time. Return 1 if
3148 it can and the references alias, 0 if it can and the references do
3149 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3150 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3151 of dr_with_seg_len::{seg_len,access_size} for A and B. */
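/* As an illustration (made-up numbers): accesses at DR_INITs 0 and 64,
   both with positive step, segment length 32 and access size 4 cover
   [0, 36) and [64, 100); the ranges cannot overlap, so 0 is returned.  */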
3153 static int
3154 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3155 tree segment_length_a, tree segment_length_b,
3156 unsigned HOST_WIDE_INT access_size_a,
3157 unsigned HOST_WIDE_INT access_size_b)
3159 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3160 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3161 poly_uint64 const_length_a;
3162 poly_uint64 const_length_b;
3164 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3165 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3166 [a, a+12) */
3167 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3169 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3170 offset_a = (offset_a + access_size_a) - const_length_a;
3172 else
3173 const_length_a = tree_to_poly_uint64 (segment_length_a);
3174 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3176 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3177 offset_b = (offset_b + access_size_b) - const_length_b;
3179 else
3180 const_length_b = tree_to_poly_uint64 (segment_length_b);
3182 const_length_a += access_size_a;
3183 const_length_b += access_size_b;
3185 if (ranges_known_overlap_p (offset_a, const_length_a,
3186 offset_b, const_length_b))
3187 return 1;
3189 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3190 offset_b, const_length_b))
3191 return 0;
3193 return -1;
3196 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3197 in DDR is >= VF. */
3199 static bool
3200 dependence_distance_ge_vf (data_dependence_relation *ddr,
3201 unsigned int loop_depth, poly_uint64 vf)
3203 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3204 || DDR_NUM_DIST_VECTS (ddr) == 0)
3205 return false;
3207 /* If the dependence is exact, we should have limited the VF instead. */
3208 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3210 unsigned int i;
3211 lambda_vector dist_v;
3212 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3214 HOST_WIDE_INT dist = dist_v[loop_depth];
3215 if (dist != 0
3216 && !(dist > 0 && DDR_REVERSED_P (ddr))
3217 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3218 return false;
3221 if (dump_enabled_p ())
3222 dump_printf_loc (MSG_NOTE, vect_location,
3223 "dependence distance between %T and %T is >= VF\n",
3224 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3226 return true;
3229 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3231 static void
3232 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3234 dump_printf (dump_kind, "%s (%T) >= ",
3235 lower_bound.unsigned_p ? "unsigned" : "abs",
3236 lower_bound.expr);
3237 dump_dec (dump_kind, lower_bound.min_value);
3240 /* Record that the vectorized loop requires the vec_lower_bound described
3241 by EXPR, UNSIGNED_P and MIN_VALUE. */
3243 static void
3244 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3245 poly_uint64 min_value)
3247 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3248 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3249 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3251 unsigned_p &= lower_bounds[i].unsigned_p;
3252 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3253 if (lower_bounds[i].unsigned_p != unsigned_p
3254 || maybe_lt (lower_bounds[i].min_value, min_value))
3256 lower_bounds[i].unsigned_p = unsigned_p;
3257 lower_bounds[i].min_value = min_value;
3258 if (dump_enabled_p ())
3260 dump_printf_loc (MSG_NOTE, vect_location,
3261 "updating run-time check to ");
3262 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3263 dump_printf (MSG_NOTE, "\n");
3266 return;
3269 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3270 if (dump_enabled_p ())
3272 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3273 dump_lower_bound (MSG_NOTE, lower_bound);
3274 dump_printf (MSG_NOTE, "\n");
3276 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3279 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3280 will span fewer than GAP bytes. */
3282 static bool
3283 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3284 poly_int64 gap)
3286 stmt_vec_info stmt_info = dr_info->stmt;
3287 HOST_WIDE_INT count
3288 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3289 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3290 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3291 return (estimated_poly_value (gap)
3292 <= count * vect_get_scalar_dr_size (dr_info));
3295 /* Return true if we know that there is no alias between DR_INFO_A and
3296 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3297 When returning true, set *LOWER_BOUND_OUT to this N. */
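/* For example (illustrative), scalar accesses to a[i] and a[i+2] with
   4-byte elements give N = 8 + 4 = 12: once abs (DR_STEP) is at least
   12 bytes the two access streams cannot overlap.  */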
3299 static bool
3300 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3301 poly_uint64 *lower_bound_out)
3303 /* Check that there is a constant gap of known sign between DR_A
3304 and DR_B. */
3305 data_reference *dr_a = dr_info_a->dr;
3306 data_reference *dr_b = dr_info_b->dr;
3307 poly_int64 init_a, init_b;
3308 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3309 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3310 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3311 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3312 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3313 || !ordered_p (init_a, init_b))
3314 return false;
3316 /* Sort DR_A and DR_B by the address they access. */
3317 if (maybe_lt (init_b, init_a))
3319 std::swap (init_a, init_b);
3320 std::swap (dr_info_a, dr_info_b);
3321 std::swap (dr_a, dr_b);
3324 /* If the two accesses could be dependent within a scalar iteration,
3325 make sure that we'd retain their order. */
3326 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3327 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3328 return false;
3330 /* There is no alias if abs (DR_STEP) is greater than or equal to
3331 the bytes spanned by the combination of the two accesses. */
3332 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3333 return true;
3336 /* Function vect_prune_runtime_alias_test_list.
3338 Prune a list of ddrs to be tested at run-time by versioning for alias.
3339 Merge several alias checks into one if possible.
3340 Return FALSE if the resulting list of ddrs is longer than allowed by
3341 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3343 opt_result
3344 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3346 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3347 hash_set <tree_pair_hash> compared_objects;
3349 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3350 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3351 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3352 vec<vec_object_pair> &check_unequal_addrs
3353 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3354 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3355 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3357 ddr_p ddr;
3358 unsigned int i;
3359 tree length_factor;
3361 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3363 /* Step values are irrelevant for aliasing if the number of vector
3364 iterations is equal to the number of scalar iterations (which can
3365 happen for fully-SLP loops). */
3366 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3368 if (!ignore_step_p)
3370 /* Convert the checks for nonzero steps into bound tests. */
3371 tree value;
3372 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3373 vect_check_lower_bound (loop_vinfo, value, true, 1);
3376 if (may_alias_ddrs.is_empty ())
3377 return opt_result::success ();
3379 comp_alias_ddrs.create (may_alias_ddrs.length ());
3381 unsigned int loop_depth
3382 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3383 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3385 /* First, we collect all data ref pairs for aliasing checks. */
3386 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3388 int comp_res;
3389 poly_uint64 lower_bound;
3390 tree segment_length_a, segment_length_b;
3391 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3392 unsigned int align_a, align_b;
3394 /* Ignore the alias if the VF we chose ended up being no greater
3395 than the dependence distance. */
3396 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3397 continue;
3399 if (DDR_OBJECT_A (ddr))
3401 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3402 if (!compared_objects.add (new_pair))
3404 if (dump_enabled_p ())
3405 dump_printf_loc (MSG_NOTE, vect_location,
3406 "checking that %T and %T"
3407 " have different addresses\n",
3408 new_pair.first, new_pair.second);
3409 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3411 continue;
3414 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3415 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3417 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3418 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3420 /* Skip the pair if inter-iteration dependencies are irrelevant
3421 and intra-iteration dependencies are guaranteed to be honored. */
3422 if (ignore_step_p
3423 && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3424 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3425 &lower_bound)))
3427 if (dump_enabled_p ())
3428 dump_printf_loc (MSG_NOTE, vect_location,
3429 "no need for alias check between "
3430 "%T and %T when VF is 1\n",
3431 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3432 continue;
3435 /* See whether we can handle the alias using a bounds check on
3436 the step, and whether that's likely to be the best approach.
3437 (It might not be, for example, if the minimum step is much larger
3438 than the number of bytes handled by one vector iteration.) */
3439 if (!ignore_step_p
3440 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3441 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3442 &lower_bound)
3443 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3444 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3446 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3447 if (dump_enabled_p ())
3449 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3450 "%T and %T when the step %T is outside ",
3451 DR_REF (dr_info_a->dr),
3452 DR_REF (dr_info_b->dr),
3453 DR_STEP (dr_info_a->dr));
3454 if (unsigned_p)
3455 dump_printf (MSG_NOTE, "[0");
3456 else
3458 dump_printf (MSG_NOTE, "(");
3459 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3461 dump_printf (MSG_NOTE, ", ");
3462 dump_dec (MSG_NOTE, lower_bound);
3463 dump_printf (MSG_NOTE, ")\n");
3465 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3466 unsigned_p, lower_bound);
3467 continue;
3470 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3471 if (dr_group_first_a)
3473 stmt_info_a = dr_group_first_a;
3474 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3477 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3478 if (dr_group_first_b)
3480 stmt_info_b = dr_group_first_b;
3481 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3484 if (ignore_step_p)
3486 segment_length_a = size_zero_node;
3487 segment_length_b = size_zero_node;
3489 else
3491 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3492 DR_STEP (dr_info_b->dr), 0))
3493 length_factor = scalar_loop_iters;
3494 else
3495 length_factor = size_int (vect_factor);
3496 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3497 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3499 access_size_a = vect_vfa_access_size (dr_info_a);
3500 access_size_b = vect_vfa_access_size (dr_info_b);
3501 align_a = vect_vfa_align (dr_info_a);
3502 align_b = vect_vfa_align (dr_info_b);
3504 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3505 DR_BASE_ADDRESS (dr_info_b->dr));
3506 if (comp_res == 0)
3507 comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3508 DR_OFFSET (dr_info_b->dr));
3510 /* See whether the alias is known at compilation time. */
3511 if (comp_res == 0
3512 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3513 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3514 && poly_int_tree_p (segment_length_a)
3515 && poly_int_tree_p (segment_length_b))
3517 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3518 segment_length_a,
3519 segment_length_b,
3520 access_size_a,
3521 access_size_b);
3522 if (res >= 0 && dump_enabled_p ())
3524 dump_printf_loc (MSG_NOTE, vect_location,
3525 "can tell at compile time that %T and %T",
3526 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3527 if (res == 0)
3528 dump_printf (MSG_NOTE, " do not alias\n");
3529 else
3530 dump_printf (MSG_NOTE, " alias\n");
3533 if (res == 0)
3534 continue;
3536 if (res == 1)
3537 return opt_result::failure_at (stmt_info_b->stmt,
3538 "not vectorized:"
3539 " compilation time alias: %G%G",
3540 stmt_info_a->stmt,
3541 stmt_info_b->stmt);
3544 dr_with_seg_len_pair_t dr_with_seg_len_pair
3545 (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3546 access_size_a, align_a),
3547 dr_with_seg_len (dr_info_b->dr, segment_length_b,
3548 access_size_b, align_b));
3550 /* Canonicalize pairs by sorting the two DR members. */
3551 if (comp_res > 0)
3552 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3554 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3557 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3559 unsigned int count = (comp_alias_ddrs.length ()
3560 + check_unequal_addrs.length ());
3562 if (dump_enabled_p ())
3563 dump_printf_loc (MSG_NOTE, vect_location,
3564 "improved number of alias checks from %d to %d\n",
3565 may_alias_ddrs.length (), count);
3566 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3567 return opt_result::failure_at
3568 (vect_location,
3569 "number of versioning for alias "
3570 "run-time tests exceeds %d "
3571 "(--param vect-max-version-for-alias-checks)\n",
3572 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3574 return opt_result::success ();
3577 /* Check whether we can use an internal function for a gather load
3578 or scatter store. READ_P is true for loads and false for stores.
3579 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3580 the type of the memory elements being loaded or stored. OFFSET_BITS
3581 is the number of bits in each scalar offset and OFFSET_SIGN is the
3582 sign of the offset. SCALE is the amount by which the offset should
3583 be multiplied *after* it has been converted to address width.
3585 Return true if the function is supported, storing the function
3586 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
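/* For example (illustrative only): an unconditional gather of SImode
   elements with SImode offsets is queried as IFN_GATHER_LOAD, its
   conditional form as IFN_MASK_GATHER_LOAD, and the store-side analogues
   as IFN_SCATTER_STORE / IFN_MASK_SCATTER_STORE; the final answer comes
   from internal_gather_scatter_fn_supported_p below.  */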
3588 bool
3589 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3590 tree memory_type, unsigned int offset_bits,
3591 signop offset_sign, int scale,
3592 internal_fn *ifn_out, tree *element_type_out)
3594 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3595 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3596 if (offset_bits > element_bits)
3597 /* Internal functions require the offset to be the same width as
3598 the vector elements. We can extend narrower offsets, but it isn't
3599 safe to truncate wider offsets. */
3600 return false;
3602 if (element_bits != memory_bits)
3603 /* For now the vector elements must be the same width as the
3604 memory elements. */
3605 return false;
3607 /* Work out which function we need. */
3608 internal_fn ifn;
3609 if (read_p)
3610 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3611 else
3612 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3614 /* Test whether the target supports this combination. */
3615 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3616 offset_sign, scale))
3617 return false;
3619 *ifn_out = ifn;
3620 *element_type_out = TREE_TYPE (vectype);
3621 return true;
3624 /* STMT_INFO is a call to an internal gather load or scatter store function.
3625 Describe the operation in INFO. */
3627 static void
3628 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3629 gather_scatter_info *info)
3631 gcall *call = as_a <gcall *> (stmt_info->stmt);
3632 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3633 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3635 info->ifn = gimple_call_internal_fn (call);
3636 info->decl = NULL_TREE;
3637 info->base = gimple_call_arg (call, 0);
3638 info->offset = gimple_call_arg (call, 1);
3639 info->offset_dt = vect_unknown_def_type;
3640 info->offset_vectype = NULL_TREE;
3641 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3642 info->element_type = TREE_TYPE (vectype);
3643 info->memory_type = TREE_TYPE (DR_REF (dr));
3646 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3647 gather load or scatter store. Describe the operation in *INFO if so. */
3649 bool
3650 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3651 gather_scatter_info *info)
3653 HOST_WIDE_INT scale = 1;
3654 poly_int64 pbitpos, pbitsize;
3655 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3656 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3657 tree offtype = NULL_TREE;
3658 tree decl = NULL_TREE, base, off;
3659 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3660 tree memory_type = TREE_TYPE (DR_REF (dr));
3661 machine_mode pmode;
3662 int punsignedp, reversep, pvolatilep = 0;
3663 internal_fn ifn;
3664 tree element_type;
3665 bool masked_p = false;
3667 /* See whether this is already a call to a gather/scatter internal function.
3668 If not, see whether it's a masked load or store. */
3669 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3670 if (call && gimple_call_internal_p (call))
3672 ifn = gimple_call_internal_fn (call);
3673 if (internal_gather_scatter_fn_p (ifn))
3675 vect_describe_gather_scatter_call (stmt_info, info);
3676 return true;
3678 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3681 /* True if we should aim to use internal functions rather than
3682 built-in functions. */
3683 bool use_ifn_p = (DR_IS_READ (dr)
3684 ? supports_vec_gather_load_p ()
3685 : supports_vec_scatter_store_p ());
3687 base = DR_REF (dr);
3688 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3689 see if we can use the def stmt of the address. */
3690 if (masked_p
3691 && TREE_CODE (base) == MEM_REF
3692 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3693 && integer_zerop (TREE_OPERAND (base, 1))
3694 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3696 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3697 if (is_gimple_assign (def_stmt)
3698 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3699 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3702 /* The gather and scatter builtins need an address of the form
3703 loop_invariant + vector * {1, 2, 4, 8}
3705 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3706 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3707 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3708 multiplications and additions in it. To get a vector, we need
3709 a single SSA_NAME that will be defined in the loop and will
3710 contain everything that is not loop invariant and that can be
3711 vectorized. The following code attempts to find such a preexisting
3712 SSA_NAME OFF and put the loop invariants into a tree BASE
3713 that can be gimplified before the loop. */
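/* As a purely illustrative example, for a read such as

     ... = a[k[i]];

   with A loop invariant and K[I] computed in the loop, we would like to
   end up (roughly) with BASE holding the invariant &a part, OFF being the
   SSA_NAME that carries k[i], and SCALE being the element size, matching
   the builtin/internal-function form above.  */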
3714 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3715 &punsignedp, &reversep, &pvolatilep);
3716 if (reversep)
3717 return false;
3719 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3721 if (TREE_CODE (base) == MEM_REF)
3723 if (!integer_zerop (TREE_OPERAND (base, 1)))
3725 if (off == NULL_TREE)
3726 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3727 else
3728 off = size_binop (PLUS_EXPR, off,
3729 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3731 base = TREE_OPERAND (base, 0);
3733 else
3734 base = build_fold_addr_expr (base);
3736 if (off == NULL_TREE)
3737 off = size_zero_node;
3739 /* If BASE is not loop invariant, then if OFF is zero we start with just
3740 the constant offset in the loop-invariant BASE and continue with BASE
3741 as OFF; otherwise we give up.
3742 We could handle that case by gimplifying the addition of base + off
3743 into some SSA_NAME and using that as OFF, but for now punt. */
3744 if (!expr_invariant_in_loop_p (loop, base))
3746 if (!integer_zerop (off))
3747 return false;
3748 off = base;
3749 base = size_int (pbytepos);
3751 /* Otherwise put base + constant offset into the loop invariant BASE
3752 and continue with OFF. */
3753 else
3755 base = fold_convert (sizetype, base);
3756 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3759 /* OFF at this point may be either a SSA_NAME or some tree expression
3760 from get_inner_reference. Try to peel off loop invariants from it
3761 into BASE as long as possible. */
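/* Illustrative example with made-up names: if OFF is currently
   (sizetype) _idx * 4 + 16 and _idx is defined in the loop, the loop
   below folds the invariant 16 into BASE, may turn the multiplication by
   4 into SCALE when the target supports that scale, and leaves
   OFF = _idx.  */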
3762 STRIP_NOPS (off);
3763 while (offtype == NULL_TREE)
3765 enum tree_code code;
3766 tree op0, op1, add = NULL_TREE;
3768 if (TREE_CODE (off) == SSA_NAME)
3770 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3772 if (expr_invariant_in_loop_p (loop, off))
3773 return false;
3775 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3776 break;
3778 op0 = gimple_assign_rhs1 (def_stmt);
3779 code = gimple_assign_rhs_code (def_stmt);
3780 op1 = gimple_assign_rhs2 (def_stmt);
3782 else
3784 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3785 return false;
3786 code = TREE_CODE (off);
3787 extract_ops_from_tree (off, &code, &op0, &op1);
3789 switch (code)
3791 case POINTER_PLUS_EXPR:
3792 case PLUS_EXPR:
3793 if (expr_invariant_in_loop_p (loop, op0))
3795 add = op0;
3796 off = op1;
3797 do_add:
3798 add = fold_convert (sizetype, add);
3799 if (scale != 1)
3800 add = size_binop (MULT_EXPR, add, size_int (scale));
3801 base = size_binop (PLUS_EXPR, base, add);
3802 continue;
3804 if (expr_invariant_in_loop_p (loop, op1))
3806 add = op1;
3807 off = op0;
3808 goto do_add;
3810 break;
3811 case MINUS_EXPR:
3812 if (expr_invariant_in_loop_p (loop, op1))
3814 add = fold_convert (sizetype, op1);
3815 add = size_binop (MINUS_EXPR, size_zero_node, add);
3816 off = op0;
3817 goto do_add;
3819 break;
3820 case MULT_EXPR:
3821 if (scale == 1 && tree_fits_shwi_p (op1))
3823 int new_scale = tree_to_shwi (op1);
3824 /* Only treat this as a scaling operation if the target
3825 supports it. */
3826 if (use_ifn_p
3827 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3828 vectype, memory_type, 1,
3829 TYPE_SIGN (TREE_TYPE (op0)),
3830 new_scale, &ifn,
3831 &element_type))
3832 break;
3833 scale = new_scale;
3834 off = op0;
3835 continue;
3837 break;
3838 case SSA_NAME:
3839 off = op0;
3840 continue;
3841 CASE_CONVERT:
3842 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3843 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3844 break;
3845 if (TYPE_PRECISION (TREE_TYPE (op0))
3846 == TYPE_PRECISION (TREE_TYPE (off)))
3848 off = op0;
3849 continue;
3852 /* The internal functions need the offset to be the same width
3853 as the elements of VECTYPE. Don't include operations that
3854 cast the offset from that width to a different width. */
3855 if (use_ifn_p
3856 && (int_size_in_bytes (TREE_TYPE (vectype))
3857 == int_size_in_bytes (TREE_TYPE (off))))
3858 break;
3860 if (TYPE_PRECISION (TREE_TYPE (op0))
3861 < TYPE_PRECISION (TREE_TYPE (off)))
3863 off = op0;
3864 offtype = TREE_TYPE (off);
3865 STRIP_NOPS (off);
3866 continue;
3868 break;
3869 default:
3870 break;
3872 break;
3875 /* If at the end OFF still isn't a SSA_NAME or isn't
3876 defined in the loop, punt. */
3877 if (TREE_CODE (off) != SSA_NAME
3878 || expr_invariant_in_loop_p (loop, off))
3879 return false;
3881 if (offtype == NULL_TREE)
3882 offtype = TREE_TYPE (off);
3884 if (use_ifn_p)
3886 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3887 memory_type, TYPE_PRECISION (offtype),
3888 TYPE_SIGN (offtype), scale, &ifn,
3889 &element_type))
3890 return false;
3892 else
3894 if (DR_IS_READ (dr))
3896 if (targetm.vectorize.builtin_gather)
3897 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3899 else
3901 if (targetm.vectorize.builtin_scatter)
3902 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3905 if (!decl)
3906 return false;
3908 ifn = IFN_LAST;
3909 element_type = TREE_TYPE (vectype);
3912 info->ifn = ifn;
3913 info->decl = decl;
3914 info->base = base;
3915 info->offset = off;
3916 info->offset_dt = vect_unknown_def_type;
3917 info->offset_vectype = NULL_TREE;
3918 info->scale = scale;
3919 info->element_type = element_type;
3920 info->memory_type = memory_type;
3921 return true;
3924 /* Find the data references in STMT, analyze them with respect to LOOP and
3925 append them to DATAREFS. Return false if datarefs in this stmt cannot
3926 be handled. */
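/* For instance, a call is only accepted here when it is an IFN_MASK_LOAD
   or IFN_MASK_STORE internal call carrying a single data reference; other
   calls with data refs, volatile accesses, throwing stmts, bit-field
   accesses and refs whose base address is a constant are all rejected by
   the checks below.  */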
3928 opt_result
3929 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3930 vec<data_reference_p> *datarefs)
3932 /* We can ignore clobbers for dataref analysis - they are removed during
3933 loop vectorization and BB vectorization checks dependences with a
3934 stmt walk. */
3935 if (gimple_clobber_p (stmt))
3936 return opt_result::success ();
3938 if (gimple_has_volatile_ops (stmt))
3939 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
3940 stmt);
3942 if (stmt_can_throw_internal (cfun, stmt))
3943 return opt_result::failure_at (stmt,
3944 "not vectorized:"
3945 " statement can throw an exception: %G",
3946 stmt);
3948 auto_vec<data_reference_p, 2> refs;
3949 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
3950 if (!res)
3951 return res;
3953 if (refs.is_empty ())
3954 return opt_result::success ();
3956 if (refs.length () > 1)
3957 return opt_result::failure_at (stmt,
3958 "not vectorized:"
3959 " more than one data ref in stmt: %G", stmt);
3961 if (gcall *call = dyn_cast <gcall *> (stmt))
3962 if (!gimple_call_internal_p (call)
3963 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
3964 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
3965 return opt_result::failure_at (stmt,
3966 "not vectorized: dr in a call %G", stmt);
3968 data_reference_p dr = refs.pop ();
3969 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3970 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3971 return opt_result::failure_at (stmt,
3972 "not vectorized:"
3973 " statement is bitfield access %G", stmt);
3975 if (DR_BASE_ADDRESS (dr)
3976 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3977 return opt_result::failure_at (stmt,
3978 "not vectorized:"
3979 " base addr of dr is a constant\n");
3981 /* Check whether this may be a SIMD lane access and adjust the
3982 DR to make it easier for us to handle it. */
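/* Illustrative sketch: a simd-lane access typically has an offset of the
   form (<cast>) _lane * <elt_size>, where _lane comes from an
   IFN_GOMP_SIMD_LANE call for loop->simduid.  When that pattern is
   recognized below, the DR is rewritten to offset 0 and step <elt_size>
   and flagged via newdr->aux == (void *)-1.  */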
3983 if (loop
3984 && loop->simduid
3985 && (!DR_BASE_ADDRESS (dr)
3986 || !DR_OFFSET (dr)
3987 || !DR_INIT (dr)
3988 || !DR_STEP (dr)))
3990 struct data_reference *newdr
3991 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
3992 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
3993 if (DR_BASE_ADDRESS (newdr)
3994 && DR_OFFSET (newdr)
3995 && DR_INIT (newdr)
3996 && DR_STEP (newdr)
3997 && integer_zerop (DR_STEP (newdr)))
3999 tree off = DR_OFFSET (newdr);
4000 STRIP_NOPS (off);
4001 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4002 && TREE_CODE (off) == MULT_EXPR
4003 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4005 tree step = TREE_OPERAND (off, 1);
4006 off = TREE_OPERAND (off, 0);
4007 STRIP_NOPS (off);
4008 if (CONVERT_EXPR_P (off)
4009 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4010 < TYPE_PRECISION (TREE_TYPE (off))))
4011 off = TREE_OPERAND (off, 0);
4012 if (TREE_CODE (off) == SSA_NAME)
4014 gimple *def = SSA_NAME_DEF_STMT (off);
4015 tree reft = TREE_TYPE (DR_REF (newdr));
4016 if (is_gimple_call (def)
4017 && gimple_call_internal_p (def)
4018 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4020 tree arg = gimple_call_arg (def, 0);
4021 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4022 arg = SSA_NAME_VAR (arg);
4023 if (arg == loop->simduid
4024 /* For now. */
4025 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4027 DR_OFFSET (newdr) = ssize_int (0);
4028 DR_STEP (newdr) = step;
4029 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4030 DR_STEP_ALIGNMENT (newdr)
4031 = highest_pow2_factor (step);
4032 /* Mark as simd-lane access. */
4033 newdr->aux = (void *)-1;
4034 free_data_ref (dr);
4035 datarefs->safe_push (newdr);
4036 return opt_result::success ();
4042 free_data_ref (newdr);
4045 datarefs->safe_push (dr);
4046 return opt_result::success ();
4049 /* Function vect_analyze_data_refs.
4051 Find all the data references in the loop or basic block.
4053 The general structure of the analysis of data refs in the vectorizer is as
4054 follows:
4055 1- vect_analyze_data_refs(loop/bb): call
4056 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4057 in the loop/bb and their dependences.
4058 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4059 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4060 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4064 opt_result
4065 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4067 struct loop *loop = NULL;
4068 unsigned int i;
4069 struct data_reference *dr;
4070 tree scalar_type;
4072 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4074 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4075 loop = LOOP_VINFO_LOOP (loop_vinfo);
4077 /* Go through the data-refs, check that the analysis succeeded. Update
4078 pointer from stmt_vec_info struct to DR and vectype. */
4080 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4081 FOR_EACH_VEC_ELT (datarefs, i, dr)
4083 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4084 poly_uint64 vf;
4086 gcc_assert (DR_REF (dr));
4087 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4088 gcc_assert (!stmt_info->dr_aux.dr);
4089 stmt_info->dr_aux.dr = dr;
4090 stmt_info->dr_aux.stmt = stmt_info;
4092 /* Check that analysis of the data-ref succeeded. */
4093 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4094 || !DR_STEP (dr))
4096 bool maybe_gather
4097 = DR_IS_READ (dr)
4098 && !TREE_THIS_VOLATILE (DR_REF (dr))
4099 && (targetm.vectorize.builtin_gather != NULL
4100 || supports_vec_gather_load_p ());
4101 bool maybe_scatter
4102 = DR_IS_WRITE (dr)
4103 && !TREE_THIS_VOLATILE (DR_REF (dr))
4104 && (targetm.vectorize.builtin_scatter != NULL
4105 || supports_vec_scatter_store_p ());
4107 /* If the target supports vector gather loads or scatter stores,
4108 see whether they can be used. */
4109 if (is_a <loop_vec_info> (vinfo)
4110 && !nested_in_vect_loop_p (loop, stmt_info))
4112 if (maybe_gather || maybe_scatter)
4114 if (maybe_gather)
4115 gatherscatter = GATHER;
4116 else
4117 gatherscatter = SCATTER;
4121 if (gatherscatter == SG_NONE)
4123 if (dump_enabled_p ())
4124 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4125 "not vectorized: data ref analysis "
4126 "failed %G", stmt_info->stmt);
4127 if (is_a <bb_vec_info> (vinfo))
4129 /* In BB vectorization the ref can still participate
4130 in dependence analysis, we just can't vectorize it. */
4131 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4132 continue;
4134 return opt_result::failure_at (stmt_info->stmt,
4135 "not vectorized:"
4136 " data ref analysis failed: %G",
4137 stmt_info->stmt);
4141 /* See if this was detected as SIMD lane access. */
4142 if (dr->aux == (void *)-1)
4144 if (nested_in_vect_loop_p (loop, stmt_info))
4145 return opt_result::failure_at (stmt_info->stmt,
4146 "not vectorized:"
4147 " data ref analysis failed: %G",
4148 stmt_info->stmt);
4149 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4152 tree base = get_base_address (DR_REF (dr));
4153 if (base && VAR_P (base) && DECL_NONALIASED (base))
4155 if (dump_enabled_p ())
4156 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4157 "not vectorized: base object not addressable "
4158 "for stmt: %G", stmt_info->stmt);
4159 if (is_a <bb_vec_info> (vinfo))
4161 /* In BB vectorization the ref can still participate
4162 in dependence analysis, we just can't vectorize it. */
4163 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4164 continue;
4166 return opt_result::failure_at (stmt_info->stmt,
4167 "not vectorized: base object not"
4168 " addressable for stmt: %G",
4169 stmt_info->stmt);
4172 if (is_a <loop_vec_info> (vinfo)
4173 && DR_STEP (dr)
4174 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4176 if (nested_in_vect_loop_p (loop, stmt_info))
4177 return opt_result::failure_at (stmt_info->stmt,
4178 "not vectorized:"
4179 "not suitable for strided load %G",
4180 stmt_info->stmt);
4181 STMT_VINFO_STRIDED_P (stmt_info) = true;
4184 /* Update DR field in stmt_vec_info struct. */
4186 /* If the dataref is in an inner-loop of the loop that is considered
4187 for vectorization, we also want to analyze the access relative to
4188 the outer-loop (DR contains information only relative to the
4189 inner-most enclosing loop). We do that by building a reference to the
4190 first location accessed by the inner-loop, and analyze it relative to
4191 the outer-loop. */
4192 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4194 /* Build a reference to the first location accessed by the
4195 inner loop: *(BASE + INIT + OFFSET). By construction,
4196 this address must be invariant in the inner loop, so we
4197 can consider it as being used in the outer loop. */
4198 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4199 tree offset = unshare_expr (DR_OFFSET (dr));
4200 tree init = unshare_expr (DR_INIT (dr));
4201 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4202 init, offset);
4203 tree init_addr = fold_build_pointer_plus (base, init_offset);
4204 tree init_ref = build_fold_indirect_ref (init_addr);
4206 if (dump_enabled_p ())
4207 dump_printf_loc (MSG_NOTE, vect_location,
4208 "analyze in outer loop: %T\n", init_ref);
4210 opt_result res
4211 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4212 init_ref, loop, stmt_info->stmt);
4213 if (!res)
4214 /* dr_analyze_innermost already explained the failure. */
4215 return res;
4217 if (dump_enabled_p ())
4218 dump_printf_loc (MSG_NOTE, vect_location,
4219 "\touter base_address: %T\n"
4220 "\touter offset from base address: %T\n"
4221 "\touter constant offset from base address: %T\n"
4222 "\touter step: %T\n"
4223 "\touter base alignment: %d\n\n"
4224 "\touter base misalignment: %d\n"
4225 "\touter offset alignment: %d\n"
4226 "\touter step alignment: %d\n",
4227 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4228 STMT_VINFO_DR_OFFSET (stmt_info),
4229 STMT_VINFO_DR_INIT (stmt_info),
4230 STMT_VINFO_DR_STEP (stmt_info),
4231 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4232 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4233 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4234 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4237 /* Set vectype for STMT. */
4238 scalar_type = TREE_TYPE (DR_REF (dr));
4239 STMT_VINFO_VECTYPE (stmt_info)
4240 = get_vectype_for_scalar_type (scalar_type);
4241 if (!STMT_VINFO_VECTYPE (stmt_info))
4243 if (dump_enabled_p ())
4245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4246 "not vectorized: no vectype for stmt: %G",
4247 stmt_info->stmt);
4248 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4249 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4250 scalar_type);
4251 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4254 if (is_a <bb_vec_info> (vinfo))
4256 /* No vector type is fine, the ref can still participate
4257 in dependence analysis, we just can't vectorize it. */
4258 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4259 continue;
4261 return opt_result::failure_at (stmt_info->stmt,
4262 "not vectorized:"
4263 " no vectype for stmt: %G"
4264 " scalar_type: %T\n",
4265 stmt_info->stmt, scalar_type);
4267 else
4269 if (dump_enabled_p ())
4270 dump_printf_loc (MSG_NOTE, vect_location,
4271 "got vectype for stmt: %G%T\n",
4272 stmt_info->stmt, STMT_VINFO_VECTYPE (stmt_info));
4275 /* Adjust the minimal vectorization factor according to the
4276 vector type. */
4277 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4278 *min_vf = upper_bound (*min_vf, vf);
4280 if (gatherscatter != SG_NONE)
4282 gather_scatter_info gs_info;
4283 if (!vect_check_gather_scatter (stmt_info,
4284 as_a <loop_vec_info> (vinfo),
4285 &gs_info)
4286 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4287 return opt_result::failure_at
4288 (stmt_info->stmt,
4289 (gatherscatter == GATHER) ?
4290 "not vectorized: not suitable for gather load %G" :
4291 "not vectorized: not suitable for scatter store %G",
4292 stmt_info->stmt);
4293 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4297 /* We used to stop processing and prune the list here. Verify we no
4298 longer need to. */
4299 gcc_assert (i == datarefs.length ());
4301 return opt_result::success ();
4305 /* Function vect_get_new_vect_var.
4307 Returns a name for a new variable. The current naming scheme prepends
4308 a prefix ("vect", "stmp", "mask" or "vectp", depending on the value of
4309 VAR_KIND) to the names of vectorizer-generated variables, joining it to
4310 NAME with an underscore when NAME is provided. */
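/* E.g. (illustrative), calling this with VAR_KIND == vect_pointer_var and
   NAME == "a" asks create_tmp_reg for a temporary based on the name
   "vectp_a", while a NULL NAME falls back to the bare "vectp" prefix.  */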
4312 tree
4313 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4315 const char *prefix;
4316 tree new_vect_var;
4318 switch (var_kind)
4320 case vect_simple_var:
4321 prefix = "vect";
4322 break;
4323 case vect_scalar_var:
4324 prefix = "stmp";
4325 break;
4326 case vect_mask_var:
4327 prefix = "mask";
4328 break;
4329 case vect_pointer_var:
4330 prefix = "vectp";
4331 break;
4332 default:
4333 gcc_unreachable ();
4336 if (name)
4338 char* tmp = concat (prefix, "_", name, NULL);
4339 new_vect_var = create_tmp_reg (type, tmp);
4340 free (tmp);
4342 else
4343 new_vect_var = create_tmp_reg (type, prefix);
4345 return new_vect_var;
4348 /* Like vect_get_new_vect_var but return an SSA name. */
4350 tree
4351 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4353 const char *prefix;
4354 tree new_vect_var;
4356 switch (var_kind)
4358 case vect_simple_var:
4359 prefix = "vect";
4360 break;
4361 case vect_scalar_var:
4362 prefix = "stmp";
4363 break;
4364 case vect_pointer_var:
4365 prefix = "vectp";
4366 break;
4367 default:
4368 gcc_unreachable ();
4371 if (name)
4373 char* tmp = concat (prefix, "_", name, NULL);
4374 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4375 free (tmp);
4377 else
4378 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4380 return new_vect_var;
4383 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4385 static void
4386 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4388 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4389 int misalign = DR_MISALIGNMENT (dr_info);
4390 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4391 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4392 else
4393 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4394 known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4395 misalign);
4398 /* Function vect_create_addr_base_for_vector_ref.
4400 Create an expression that computes the address of the first memory location
4401 that will be accessed for a data reference.
4403 Input:
4404 STMT_INFO: The statement containing the data reference.
4405 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4406 OFFSET: Optional. If supplied, it is added to the initial address.
4407 LOOP: Specify relative to which loop-nest the address should be computed.
4408 For example, when the dataref is in an inner-loop nested in an
4409 outer-loop that is now being vectorized, LOOP can be either the
4410 outer-loop, or the inner-loop. The first memory location accessed
4411 by the following dataref ('in' points to short):
4413 for (i=0; i<N; i++)
4414 for (j=0; j<M; j++)
4415 s += in[i+j]
4417 is as follows:
4418 if LOOP=i_loop: &in (relative to i_loop)
4419 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4420 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4421 initial address. Unlike OFFSET, which is number of elements to
4422 be added, BYTE_OFFSET is measured in bytes.
4424 Output:
4425 1. Return an SSA_NAME whose value is the address of the memory location of
4426 the first vector of the data reference.
4427 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4428 these statement(s) which define the returned SSA_NAME.
4430 FORNOW: We are only handling array accesses with step 1. */
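/* Roughly, in the loop-vinfo case the address built below (before
   gimplification) is

     addr_base = base_address + (offset + init + OFFSET * step + BYTE_OFFSET)

   where base_address, offset and init come from the data-ref's innermost
   behavior, step is the size of one scalar element, and the OFFSET and
   BYTE_OFFSET terms are present only when those arguments are given.  */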
4432 tree
4433 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4434 gimple_seq *new_stmt_list,
4435 tree offset,
4436 tree byte_offset)
4438 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4439 struct data_reference *dr = dr_info->dr;
4440 const char *base_name;
4441 tree addr_base;
4442 tree dest;
4443 gimple_seq seq = NULL;
4444 tree vect_ptr_type;
4445 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4446 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4447 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4449 tree data_ref_base = unshare_expr (drb->base_address);
4450 tree base_offset = unshare_expr (drb->offset);
4451 tree init = unshare_expr (drb->init);
4453 if (loop_vinfo)
4454 base_name = get_name (data_ref_base);
4455 else
4457 base_offset = ssize_int (0);
4458 init = ssize_int (0);
4459 base_name = get_name (DR_REF (dr));
4462 /* Create base_offset */
4463 base_offset = size_binop (PLUS_EXPR,
4464 fold_convert (sizetype, base_offset),
4465 fold_convert (sizetype, init));
4467 if (offset)
4469 offset = fold_build2 (MULT_EXPR, sizetype,
4470 fold_convert (sizetype, offset), step);
4471 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4472 base_offset, offset);
4474 if (byte_offset)
4476 byte_offset = fold_convert (sizetype, byte_offset);
4477 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4478 base_offset, byte_offset);
4481 /* base + base_offset */
4482 if (loop_vinfo)
4483 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4484 else
4486 addr_base = build1 (ADDR_EXPR,
4487 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4488 unshare_expr (DR_REF (dr)));
4491 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4492 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4493 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4494 gimple_seq_add_seq (new_stmt_list, seq);
4496 if (DR_PTR_INFO (dr)
4497 && TREE_CODE (addr_base) == SSA_NAME
4498 && !SSA_NAME_PTR_INFO (addr_base))
4500 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4501 if (offset || byte_offset)
4502 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4505 if (dump_enabled_p ())
4506 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4508 return addr_base;
4512 /* Function vect_create_data_ref_ptr.
4514 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4515 location accessed in the loop by STMT_INFO, along with the def-use update
4516 chain to appropriately advance the pointer through the loop iterations.
4517 Also set aliasing information for the pointer. This pointer is used by
4518 the callers to this function to create a memory reference expression for
4519 vector load/store access.
4521 Input:
4522 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4523 GIMPLE_ASSIGN <name, data-ref> or
4524 GIMPLE_ASSIGN <data-ref, name>.
4525 2. AGGR_TYPE: the type of the reference, which should be either a vector
4526 or an array.
4527 3. AT_LOOP: the loop where the vector memref is to be created.
4528 4. OFFSET (optional): an offset to be added to the initial address accessed
4529 by the data-ref in STMT_INFO.
4530 5. BSI: location where the new stmts are to be placed if there is no loop
4531 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4532 pointing to the initial address.
4533 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4534 to the initial address accessed by the data-ref in STMT_INFO. This is
4535 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4536 in bytes.
4537 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4538 to the IV during each iteration of the loop. NULL says to move
4539 by one copy of AGGR_TYPE up or down, depending on the step of the
4540 data reference.
4542 Output:
4543 1. Declare a new ptr to vector_type, and have it point to the base of the
4544 data reference (initial address accessed by the data reference).
4545 For example, for vector of type V8HI, the following code is generated:
4547 v8hi *ap;
4548 ap = (v8hi *)initial_address;
4550 if OFFSET is not supplied:
4551 initial_address = &a[init];
4552 if OFFSET is supplied:
4553 initial_address = &a[init + OFFSET];
4554 if BYTE_OFFSET is supplied:
4555 initial_address = &a[init] + BYTE_OFFSET;
4557 Return the initial_address in INITIAL_ADDRESS.
4559 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4560 update the pointer in each iteration of the loop.
4562 Return the increment stmt that updates the pointer in PTR_INCR.
4564 3. Return the pointer. */
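/* Illustrative note: when IV_STEP is not given, step (3) below advances
   the pointer by TYPE_SIZE_UNIT (AGGR_TYPE) each iteration, negating that
   amount when the data reference steps downwards.  */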
4566 tree
4567 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4568 struct loop *at_loop, tree offset,
4569 tree *initial_address, gimple_stmt_iterator *gsi,
4570 gimple **ptr_incr, bool only_init,
4571 tree byte_offset, tree iv_step)
4573 const char *base_name;
4574 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4575 struct loop *loop = NULL;
4576 bool nested_in_vect_loop = false;
4577 struct loop *containing_loop = NULL;
4578 tree aggr_ptr_type;
4579 tree aggr_ptr;
4580 tree new_temp;
4581 gimple_seq new_stmt_list = NULL;
4582 edge pe = NULL;
4583 basic_block new_bb;
4584 tree aggr_ptr_init;
4585 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4586 struct data_reference *dr = dr_info->dr;
4587 tree aptr;
4588 gimple_stmt_iterator incr_gsi;
4589 bool insert_after;
4590 tree indx_before_incr, indx_after_incr;
4591 gimple *incr;
4592 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4594 gcc_assert (iv_step != NULL_TREE
4595 || TREE_CODE (aggr_type) == ARRAY_TYPE
4596 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4598 if (loop_vinfo)
4600 loop = LOOP_VINFO_LOOP (loop_vinfo);
4601 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4602 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4603 pe = loop_preheader_edge (loop);
4605 else
4607 gcc_assert (bb_vinfo);
4608 only_init = true;
4609 *ptr_incr = NULL;
4612 /* Create an expression for the first address accessed by this load
4613 in LOOP. */
4614 base_name = get_name (DR_BASE_ADDRESS (dr));
4616 if (dump_enabled_p ())
4618 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4619 dump_printf_loc (MSG_NOTE, vect_location,
4620 "create %s-pointer variable to type: %T",
4621 get_tree_code_name (TREE_CODE (aggr_type)),
4622 aggr_type);
4623 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4624 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4625 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4626 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4627 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4628 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4629 else
4630 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4631 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4634 /* (1) Create the new aggregate-pointer variable.
4635 Vector and array types inherit the alias set of their component
4636 type by default so we need to use a ref-all pointer if the data
4637 reference does not conflict with the created aggregated data
4638 reference because it is not addressable. */
4639 bool need_ref_all = false;
4640 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4641 get_alias_set (DR_REF (dr))))
4642 need_ref_all = true;
4643 /* Likewise for any of the data references in the stmt group. */
4644 else if (DR_GROUP_SIZE (stmt_info) > 1)
4646 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4649 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4650 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4651 get_alias_set (DR_REF (sdr))))
4653 need_ref_all = true;
4654 break;
4656 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4658 while (sinfo);
4660 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4661 need_ref_all);
4662 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4665 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4666 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4667 def-use update cycles for the pointer: one relative to the outer-loop
4668 (LOOP), which is what steps (2) and (3) below do. The other is relative
4669 to the inner-loop (which is the inner-most loop containing the dataref),
4670 and this is done by step (4) below.
4672 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4673 inner-most loop, and so steps (2),(3) work the same, and step (4) is
4674 redundant. Steps (2),(3) create the following:
4676 vp0 = &base_addr;
4677 LOOP: vp1 = phi(vp0,vp2)
4680 vp2 = vp1 + step
4681 goto LOOP
4683 If there is an inner-loop nested in loop, then step (4) will also be
4684 applied, and an additional update in the inner-loop will be created:
4686 vp0 = &base_addr;
4687 LOOP: vp1 = phi(vp0,vp2)
4689 inner: vp3 = phi(vp1,vp4)
4690 vp4 = vp3 + inner_step
4691 if () goto inner
4693 vp2 = vp1 + step
4694 if () goto LOOP */
4696 /* (2) Calculate the initial address of the aggregate-pointer, and set
4697 the aggregate-pointer to point to it before the loop. */
4699 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4701 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4702 offset, byte_offset);
4703 if (new_stmt_list)
4705 if (pe)
4707 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4708 gcc_assert (!new_bb);
4710 else
4711 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4714 *initial_address = new_temp;
4715 aggr_ptr_init = new_temp;
4717 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4718 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4719 inner-loop nested in LOOP (during outer-loop vectorization). */
4721 /* No update in loop is required. */
4722 if (only_init && (!loop_vinfo || at_loop == loop))
4723 aptr = aggr_ptr_init;
4724 else
4726 /* Accesses to invariant addresses should be handled specially
4727 by the caller. */
4728 tree step = vect_dr_behavior (dr_info)->step;
4729 gcc_assert (!integer_zerop (step));
4731 if (iv_step == NULL_TREE)
4733 /* The step of the aggregate pointer is the type size,
4734 negated for downward accesses. */
4735 iv_step = TYPE_SIZE_UNIT (aggr_type);
4736 if (tree_int_cst_sgn (step) == -1)
4737 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4740 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4742 create_iv (aggr_ptr_init,
4743 fold_convert (aggr_ptr_type, iv_step),
4744 aggr_ptr, loop, &incr_gsi, insert_after,
4745 &indx_before_incr, &indx_after_incr);
4746 incr = gsi_stmt (incr_gsi);
4747 loop_vinfo->add_stmt (incr);
4749 /* Copy the points-to information if it exists. */
4750 if (DR_PTR_INFO (dr))
4752 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4753 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4755 if (ptr_incr)
4756 *ptr_incr = incr;
4758 aptr = indx_before_incr;
4761 if (!nested_in_vect_loop || only_init)
4762 return aptr;
4765 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4766 nested in LOOP, if it exists. */
4768 gcc_assert (nested_in_vect_loop);
4769 if (!only_init)
4771 standard_iv_increment_position (containing_loop, &incr_gsi,
4772 &insert_after);
4773 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4774 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4775 &indx_after_incr);
4776 incr = gsi_stmt (incr_gsi);
4777 loop_vinfo->add_stmt (incr);
4779 /* Copy the points-to information if it exists. */
4780 if (DR_PTR_INFO (dr))
4782 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4783 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4785 if (ptr_incr)
4786 *ptr_incr = incr;
4788 return indx_before_incr;
4790 else
4791 gcc_unreachable ();
4795 /* Function bump_vector_ptr
4797 Increment a pointer (to a vector type) by vector-size. If requested,
4798 i.e. if PTR-INCR is given, then also connect the new increment stmt
4799 to the existing def-use update-chain of the pointer, by modifying
4800 the PTR_INCR as illustrated below:
4802 The pointer def-use update-chain before this function:
4803 DATAREF_PTR = phi (p_0, p_2)
4804 ....
4805 PTR_INCR: p_2 = DATAREF_PTR + step
4807 The pointer def-use update-chain after this function:
4808 DATAREF_PTR = phi (p_0, p_2)
4809 ....
4810 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4811 ....
4812 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4814 Input:
4815 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4816 in the loop.
4817 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4818 the loop. The increment amount across iterations is expected
4819 to be vector_size.
4820 BSI - location where the new update stmt is to be placed.
4821 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4822 BUMP - optional. The offset by which to bump the pointer. If not given,
4823 the offset is assumed to be vector_size.
4825 Output: Return NEW_DATAREF_PTR as illustrated above.
4829 tree
4830 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4831 stmt_vec_info stmt_info, tree bump)
4833 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4834 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4835 tree update = TYPE_SIZE_UNIT (vectype);
4836 gassign *incr_stmt;
4837 ssa_op_iter iter;
4838 use_operand_p use_p;
4839 tree new_dataref_ptr;
4841 if (bump)
4842 update = bump;
4844 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4845 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4846 else
4847 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4848 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4849 dataref_ptr, update);
4850 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4852 /* Copy the points-to information if it exists. */
4853 if (DR_PTR_INFO (dr))
4855 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4856 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4859 if (!ptr_incr)
4860 return new_dataref_ptr;
4862 /* Update the vector-pointer's cross-iteration increment. */
4863 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4865 tree use = USE_FROM_PTR (use_p);
4867 if (use == dataref_ptr)
4868 SET_USE (use_p, new_dataref_ptr);
4869 else
4870 gcc_assert (operand_equal_p (use, update, 0));
4873 return new_dataref_ptr;
4877 /* Copy memory reference info such as base/clique from the SRC reference
4878 to the DEST MEM_REF. */
4880 void
4881 vect_copy_ref_info (tree dest, tree src)
4883 if (TREE_CODE (dest) != MEM_REF)
4884 return;
4886 tree src_base = src;
4887 while (handled_component_p (src_base))
4888 src_base = TREE_OPERAND (src_base, 0);
4889 if (TREE_CODE (src_base) != MEM_REF
4890 && TREE_CODE (src_base) != TARGET_MEM_REF)
4891 return;
4893 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
4894 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
4898 /* Function vect_create_destination_var.
4900 Create a new temporary of type VECTYPE. */
4902 tree
4903 vect_create_destination_var (tree scalar_dest, tree vectype)
4905 tree vec_dest;
4906 const char *name;
4907 char *new_name;
4908 tree type;
4909 enum vect_var_kind kind;
4911 kind = vectype
4912 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4913 ? vect_mask_var
4914 : vect_simple_var
4915 : vect_scalar_var;
4916 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4918 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4920 name = get_name (scalar_dest);
4921 if (name)
4922 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4923 else
4924 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4925 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4926 free (new_name);
4928 return vec_dest;
4931 /* Function vect_grouped_store_supported.
4933 Returns TRUE if interleave high and interleave low permutations
4934 are supported, and FALSE otherwise. */
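/* For example (illustrative), for a power-of-two COUNT only the two
   interleaving masks used by vect_permute_store_chain are needed:
   { 0, nelt, 1, nelt + 1, ... } for the high halves and the same pattern
   shifted by nelt/2 for the low halves; for COUNT == 3 the two three-way
   shuffle masks built below are tested instead.  */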
4936 bool
4937 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4939 machine_mode mode = TYPE_MODE (vectype);
4941 /* vect_permute_store_chain requires the group size to be equal to 3 or
4942 be a power of two. */
4943 if (count != 3 && exact_log2 (count) == -1)
4945 if (dump_enabled_p ())
4946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4947 "the size of the group of accesses"
4948 " is not a power of 2 or not equal to 3\n");
4949 return false;
4952 /* Check that the permutation is supported. */
4953 if (VECTOR_MODE_P (mode))
4955 unsigned int i;
4956 if (count == 3)
4958 unsigned int j0 = 0, j1 = 0, j2 = 0;
4959 unsigned int i, j;
4961 unsigned int nelt;
4962 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4964 if (dump_enabled_p ())
4965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4966 "cannot handle groups of 3 stores for"
4967 " variable-length vectors\n");
4968 return false;
4971 vec_perm_builder sel (nelt, nelt, 1);
4972 sel.quick_grow (nelt);
4973 vec_perm_indices indices;
4974 for (j = 0; j < 3; j++)
4976 int nelt0 = ((3 - j) * nelt) % 3;
4977 int nelt1 = ((3 - j) * nelt + 1) % 3;
4978 int nelt2 = ((3 - j) * nelt + 2) % 3;
4979 for (i = 0; i < nelt; i++)
4981 if (3 * i + nelt0 < nelt)
4982 sel[3 * i + nelt0] = j0++;
4983 if (3 * i + nelt1 < nelt)
4984 sel[3 * i + nelt1] = nelt + j1++;
4985 if (3 * i + nelt2 < nelt)
4986 sel[3 * i + nelt2] = 0;
4988 indices.new_vector (sel, 2, nelt);
4989 if (!can_vec_perm_const_p (mode, indices))
4991 if (dump_enabled_p ())
4992 dump_printf (MSG_MISSED_OPTIMIZATION,
4993 "permutation op not supported by target.\n");
4994 return false;
4997 for (i = 0; i < nelt; i++)
4999 if (3 * i + nelt0 < nelt)
5000 sel[3 * i + nelt0] = 3 * i + nelt0;
5001 if (3 * i + nelt1 < nelt)
5002 sel[3 * i + nelt1] = 3 * i + nelt1;
5003 if (3 * i + nelt2 < nelt)
5004 sel[3 * i + nelt2] = nelt + j2++;
5006 indices.new_vector (sel, 2, nelt);
5007 if (!can_vec_perm_const_p (mode, indices))
5009 if (dump_enabled_p ())
5010 dump_printf (MSG_MISSED_OPTIMIZATION,
5011 "permutation op not supported by target.\n");
5012 return false;
5015 return true;
5017 else
5019 /* If length is not equal to 3 then only a power of 2 is supported. */
5020 gcc_assert (pow2p_hwi (count));
5021 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5023 /* The encoding has 2 interleaved stepped patterns. */
5024 vec_perm_builder sel (nelt, 2, 3);
5025 sel.quick_grow (6);
5026 for (i = 0; i < 3; i++)
5028 sel[i * 2] = i;
5029 sel[i * 2 + 1] = i + nelt;
5031 vec_perm_indices indices (sel, 2, nelt);
5032 if (can_vec_perm_const_p (mode, indices))
5034 for (i = 0; i < 6; i++)
5035 sel[i] += exact_div (nelt, 2);
5036 indices.new_vector (sel, 2, nelt);
5037 if (can_vec_perm_const_p (mode, indices))
5038 return true;
5043 if (dump_enabled_p ())
5044 dump_printf (MSG_MISSED_OPTIMIZATION,
5045 "permutation op not supported by target.\n");
5046 return false;
5050 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5051 type VECTYPE. MASKED_P says whether the masked form is needed. */
5053 bool
5054 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5055 bool masked_p)
5057 if (masked_p)
5058 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5059 vec_mask_store_lanes_optab,
5060 vectype, count);
5061 else
5062 return vect_lanes_optab_supported_p ("vec_store_lanes",
5063 vec_store_lanes_optab,
5064 vectype, count);
5068 /* Function vect_permute_store_chain.
5070 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5071 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5072 the data correctly for the stores. Return the final references for stores
5073 in RESULT_CHAIN.
5075 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5076 The input is 4 vectors each containing 8 elements. We assign a number to
5077 each element, the input sequence is:
5079 1st vec: 0 1 2 3 4 5 6 7
5080 2nd vec: 8 9 10 11 12 13 14 15
5081 3rd vec: 16 17 18 19 20 21 22 23
5082 4th vec: 24 25 26 27 28 29 30 31
5084 The output sequence should be:
5086 1st vec: 0 8 16 24 1 9 17 25
5087 2nd vec: 2 10 18 26 3 11 19 27
5088 3rd vec: 4 12 20 28 5 13 21 29
5089 4th vec: 6 14 22 30 7 15 23 31
5091 i.e., we interleave the contents of the four vectors in their order.
5093 We use interleave_high/low instructions to create such output. The input of
5094 each interleave_high/low operation is two vectors:
5095 1st vec 2nd vec
5096 0 1 2 3 4 5 6 7
5097 the even elements of the result vector are obtained left-to-right from the
5098 high/low elements of the first vector. The odd elements of the result are
5099 obtained left-to-right from the high/low elements of the second vector.
5100 The output of interleave_high will be: 0 4 1 5
5101 and of interleave_low: 2 6 3 7
5104 The permutation is done in log LENGTH stages. In each stage interleave_high
5105 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5106 where the first argument is taken from the first half of DR_CHAIN and the
5107 second argument from its second half.
5108 In our example,
5110 I1: interleave_high (1st vec, 3rd vec)
5111 I2: interleave_low (1st vec, 3rd vec)
5112 I3: interleave_high (2nd vec, 4th vec)
5113 I4: interleave_low (2nd vec, 4th vec)
5115 The output for the first stage is:
5117 I1: 0 16 1 17 2 18 3 19
5118 I2: 4 20 5 21 6 22 7 23
5119 I3: 8 24 9 25 10 26 11 27
5120 I4: 12 28 13 29 14 30 15 31
5122 The output of the second stage, i.e. the final result is:
5124 I1: 0 8 16 24 1 9 17 25
5125 I2: 2 10 18 26 3 11 19 27
5126 I3: 4 12 20 28 5 13 21 29
5127 I4: 6 14 22 30 7 15 23 31. */
5129 void
5130 vect_permute_store_chain (vec<tree> dr_chain,
5131 unsigned int length,
5132 stmt_vec_info stmt_info,
5133 gimple_stmt_iterator *gsi,
5134 vec<tree> *result_chain)
5136 tree vect1, vect2, high, low;
5137 gimple *perm_stmt;
5138 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5139 tree perm_mask_low, perm_mask_high;
5140 tree data_ref;
5141 tree perm3_mask_low, perm3_mask_high;
5142 unsigned int i, j, n, log_length = exact_log2 (length);
5144 result_chain->quick_grow (length);
5145 memcpy (result_chain->address (), dr_chain.address (),
5146 length * sizeof (tree));
5148 if (length == 3)
5150 /* vect_grouped_store_supported ensures that this is constant. */
5151 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5152 unsigned int j0 = 0, j1 = 0, j2 = 0;
5154 vec_perm_builder sel (nelt, nelt, 1);
5155 sel.quick_grow (nelt);
5156 vec_perm_indices indices;
5157 for (j = 0; j < 3; j++)
5159 int nelt0 = ((3 - j) * nelt) % 3;
5160 int nelt1 = ((3 - j) * nelt + 1) % 3;
5161 int nelt2 = ((3 - j) * nelt + 2) % 3;
5163 for (i = 0; i < nelt; i++)
5165 if (3 * i + nelt0 < nelt)
5166 sel[3 * i + nelt0] = j0++;
5167 if (3 * i + nelt1 < nelt)
5168 sel[3 * i + nelt1] = nelt + j1++;
5169 if (3 * i + nelt2 < nelt)
5170 sel[3 * i + nelt2] = 0;
5172 indices.new_vector (sel, 2, nelt);
5173 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5175 for (i = 0; i < nelt; i++)
5177 if (3 * i + nelt0 < nelt)
5178 sel[3 * i + nelt0] = 3 * i + nelt0;
5179 if (3 * i + nelt1 < nelt)
5180 sel[3 * i + nelt1] = 3 * i + nelt1;
5181 if (3 * i + nelt2 < nelt)
5182 sel[3 * i + nelt2] = nelt + j2++;
5184 indices.new_vector (sel, 2, nelt);
5185 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5187 vect1 = dr_chain[0];
5188 vect2 = dr_chain[1];
5190 /* Create interleaving stmt:
5191 low = VEC_PERM_EXPR <vect1, vect2,
5192 {j, nelt, *, j + 1, nelt + j + 1, *,
5193 j + 2, nelt + j + 2, *, ...}> */
5194 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5195 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5196 vect2, perm3_mask_low);
5197 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5199 vect1 = data_ref;
5200 vect2 = dr_chain[2];
5201 /* Create interleaving stmt:
5202 low = VEC_PERM_EXPR <vect1, vect2,
5203 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5204 6, 7, nelt + j + 2, ...}> */
5205 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5206 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5207 vect2, perm3_mask_high);
5208 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5209 (*result_chain)[j] = data_ref;
5212 else
5214 /* If length is not equal to 3 then only a power of 2 is supported. */
5215 gcc_assert (pow2p_hwi (length));
5217 /* The encoding has 2 interleaved stepped patterns. */
5218 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5219 vec_perm_builder sel (nelt, 2, 3);
5220 sel.quick_grow (6);
5221 for (i = 0; i < 3; i++)
5223 sel[i * 2] = i;
5224 sel[i * 2 + 1] = i + nelt;
5226 vec_perm_indices indices (sel, 2, nelt);
5227 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5229 for (i = 0; i < 6; i++)
5230 sel[i] += exact_div (nelt, 2);
5231 indices.new_vector (sel, 2, nelt);
5232 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5234 for (i = 0, n = log_length; i < n; i++)
5236 for (j = 0; j < length/2; j++)
5238 vect1 = dr_chain[j];
5239 vect2 = dr_chain[j+length/2];
5241 /* Create interleaving stmt:
5242 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5243 ...}> */
5244 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5245 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5246 vect2, perm_mask_high);
5247 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5248 (*result_chain)[2*j] = high;
5250 /* Create interleaving stmt:
5251 low = VEC_PERM_EXPR <vect1, vect2,
5252 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5253 ...}> */
5254 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5255 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5256 vect2, perm_mask_low);
5257 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5258 (*result_chain)[2*j+1] = low;
5260 memcpy (dr_chain.address (), result_chain->address (),
5261 length * sizeof (tree));
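/* Editor's illustration, not part of the original pass: for a 4-element
   vector the two masks built above expand to {0, 4, 1, 5} (perm_mask_high)
   and {2, 6, 3, 7} (perm_mask_low).  A minimal scalar sketch of one such
   interleaving stage, assuming plain int elements and an even NELT; the
   helper name is hypothetical:

     static void
     interleave_step (const int *v1, const int *v2, int nelt,
                      int *high, int *low)
     {
       for (int i = 0; i < nelt / 2; i++)
         {
           high[2 * i] = v1[i];                  // selector index i
           high[2 * i + 1] = v2[i];              // selector index nelt + i
           low[2 * i] = v1[nelt / 2 + i];        // selector index nelt/2 + i
           low[2 * i + 1] = v2[nelt / 2 + i];    // selector index nelt + nelt/2 + i
         }
     }

   Applying log2 (LENGTH) such stages to the pairs in DR_CHAIN produces the
   fully interleaved order required for the grouped store.  */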
5266 /* Function vect_setup_realignment
5268 This function is called when vectorizing an unaligned load using
5269 the dr_explicit_realign[_optimized] scheme.
5270 This function generates the following code at the loop prolog:
5272 p = initial_addr;
5273 x msq_init = *(floor(p)); # prolog load
5274 realignment_token = call target_builtin;
5275 loop:
5276 x msq = phi (msq_init, ---)
5278 The stmts marked with x are generated only for the case of
5279 dr_explicit_realign_optimized.
5281 The code above sets up a new (vector) pointer, pointing to the first
5282 location accessed by STMT_INFO, and a "floor-aligned" load using that
5283 pointer. It also generates code to compute the "realignment-token"
5284 (if the relevant target hook was defined), and creates a phi-node at the
5285 loop-header bb whose arguments are the result of the prolog-load (created
5286 by this function) and the result of a load that takes place in the loop
5287 (to be created by the caller to this function).
5289 For the case of dr_explicit_realign_optimized:
5290 The caller to this function uses the phi-result (msq) to create the
5291 realignment code inside the loop, and sets up the missing phi argument,
5292 as follows:
5293 loop:
5294 msq = phi (msq_init, lsq)
5295 lsq = *(floor(p')); # load in loop
5296 result = realign_load (msq, lsq, realignment_token);
5298 For the case of dr_explicit_realign:
5299 loop:
5300 msq = *(floor(p)); # load in loop
5301 p' = p + (VS-1);
5302 lsq = *(floor(p')); # load in loop
5303 result = realign_load (msq, lsq, realignment_token);
5305 Input:
5306 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5307 a memory location that may be unaligned.
5308 GSI - place where new code is to be inserted.
5309 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5310 is used.
5312 Output:
5313 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5314 target hook, if defined.
5315 Return value - the result of the loop-header phi node. */
5317 tree
5318 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5319 tree *realignment_token,
5320 enum dr_alignment_support alignment_support_scheme,
5321 tree init_addr,
5322 struct loop **at_loop)
5324 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5325 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5326 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5327 struct data_reference *dr = dr_info->dr;
5328 struct loop *loop = NULL;
5329 edge pe = NULL;
5330 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5331 tree vec_dest;
5332 gimple *inc;
5333 tree ptr;
5334 tree data_ref;
5335 basic_block new_bb;
5336 tree msq_init = NULL_TREE;
5337 tree new_temp;
5338 gphi *phi_stmt;
5339 tree msq = NULL_TREE;
5340 gimple_seq stmts = NULL;
5341 bool compute_in_loop = false;
5342 bool nested_in_vect_loop = false;
5343 struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5344 struct loop *loop_for_initial_load = NULL;
5346 if (loop_vinfo)
5348 loop = LOOP_VINFO_LOOP (loop_vinfo);
5349 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5352 gcc_assert (alignment_support_scheme == dr_explicit_realign
5353 || alignment_support_scheme == dr_explicit_realign_optimized);
5355 /* We need to generate three things:
5356 1. the misalignment computation
5357 2. the extra vector load (for the optimized realignment scheme).
5358 3. the phi node for the two vectors from which the realignment is
5359 done (for the optimized realignment scheme). */
5361 /* 1. Determine where to generate the misalignment computation.
5363 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5364 calculation will be generated by this function, outside the loop (in the
5365 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5366 caller, inside the loop.
5368 Background: If the misalignment remains fixed throughout the iterations of
5369 the loop, then both realignment schemes are applicable, and also the
5370 misalignment computation can be done outside LOOP. This is because we are
5371 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5372 are a multiple of VS (the Vector Size), and therefore the misalignment in
5373 different vectorized LOOP iterations is always the same.
5374 The problem arises only if the memory access is in an inner-loop nested
5375 inside LOOP, which is now being vectorized using outer-loop vectorization.
5376 This is the only case when the misalignment of the memory access may not
5377 remain fixed throughout the iterations of the inner-loop (as explained in
5378 detail in vect_supportable_dr_alignment). In this case, not only is the
5379 optimized realignment scheme not applicable, but also the misalignment
5380 computation (and generation of the realignment token that is passed to
5381 REALIGN_LOAD) have to be done inside the loop.
5383 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5384 or not, which in turn determines if the misalignment is computed inside
5385 the inner-loop, or outside LOOP. */
5387 if (init_addr != NULL_TREE || !loop_vinfo)
5389 compute_in_loop = true;
5390 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5394 /* 2. Determine where to generate the extra vector load.
5396 For the optimized realignment scheme, instead of generating two vector
5397 loads in each iteration, we generate a single extra vector load in the
5398 preheader of the loop, and in each iteration reuse the result of the
5399 vector load from the previous iteration. In case the memory access is in
5400 an inner-loop nested inside LOOP, which is now being vectorized using
5401 outer-loop vectorization, we need to determine whether this initial vector
5402 load should be generated at the preheader of the inner-loop, or can be
5403 generated at the preheader of LOOP. If the memory access has no evolution
5404 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5405 to be generated inside LOOP (in the preheader of the inner-loop). */
5407 if (nested_in_vect_loop)
5409 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5410 bool invariant_in_outerloop =
5411 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5412 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5414 else
5415 loop_for_initial_load = loop;
5416 if (at_loop)
5417 *at_loop = loop_for_initial_load;
5419 if (loop_for_initial_load)
5420 pe = loop_preheader_edge (loop_for_initial_load);
5422 /* 3. For the case of the optimized realignment, create the first vector
5423 load at the loop preheader. */
5425 if (alignment_support_scheme == dr_explicit_realign_optimized)
5427 /* Create msq_init = *(floor(p1)) in the loop preheader */
5428 gassign *new_stmt;
5430 gcc_assert (!compute_in_loop);
5431 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5432 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5433 loop_for_initial_load, NULL_TREE,
5434 &init_addr, NULL, &inc, true);
5435 if (TREE_CODE (ptr) == SSA_NAME)
5436 new_temp = copy_ssa_name (ptr);
5437 else
5438 new_temp = make_ssa_name (TREE_TYPE (ptr));
5439 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5440 tree type = TREE_TYPE (ptr);
5441 new_stmt = gimple_build_assign
5442 (new_temp, BIT_AND_EXPR, ptr,
5443 fold_build2 (MINUS_EXPR, type,
5444 build_int_cst (type, 0),
5445 build_int_cst (type, align)));
5446 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5447 gcc_assert (!new_bb);
5448 data_ref
5449 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5450 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5451 vect_copy_ref_info (data_ref, DR_REF (dr));
5452 new_stmt = gimple_build_assign (vec_dest, data_ref);
5453 new_temp = make_ssa_name (vec_dest, new_stmt);
5454 gimple_assign_set_lhs (new_stmt, new_temp);
5455 if (pe)
5457 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5458 gcc_assert (!new_bb);
5460 else
5461 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5463 msq_init = gimple_assign_lhs (new_stmt);
5466 /* 4. Create realignment token using a target builtin, if available.
5467 It is done either inside the containing loop, or before LOOP (as
5468 determined above). */
5470 if (targetm.vectorize.builtin_mask_for_load)
5472 gcall *new_stmt;
5473 tree builtin_decl;
5475 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5476 if (!init_addr)
5478 /* Generate the INIT_ADDR computation outside LOOP. */
5479 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5480 NULL_TREE);
5481 if (loop)
5483 pe = loop_preheader_edge (loop);
5484 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5485 gcc_assert (!new_bb);
5487 else
5488 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5491 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5492 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5493 vec_dest =
5494 vect_create_destination_var (scalar_dest,
5495 gimple_call_return_type (new_stmt));
5496 new_temp = make_ssa_name (vec_dest, new_stmt);
5497 gimple_call_set_lhs (new_stmt, new_temp);
5499 if (compute_in_loop)
5500 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5501 else
5503 /* Generate the misalignment computation outside LOOP. */
5504 pe = loop_preheader_edge (loop);
5505 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5506 gcc_assert (!new_bb);
5509 *realignment_token = gimple_call_lhs (new_stmt);
5511 /* The result of the CALL_EXPR to this builtin is determined from
5512 the value of the parameter and no global variables are touched,
5513 which makes the builtin a "const" function. Requiring the
5514 builtin to have the "const" attribute makes it unnecessary
5515 to call mark_call_clobbered. */
5516 gcc_assert (TREE_READONLY (builtin_decl));
5519 if (alignment_support_scheme == dr_explicit_realign)
5520 return msq;
5522 gcc_assert (!compute_in_loop);
5523 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5526 /* 5. Create msq = phi <msq_init, lsq> in loop */
5528 pe = loop_preheader_edge (containing_loop);
5529 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5530 msq = make_ssa_name (vec_dest);
5531 phi_stmt = create_phi_node (msq, containing_loop->header);
5532 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5534 return msq;
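/* Editor's illustration, not part of the original function: a scalar model
   of what the generated REALIGN_LOAD sequence computes at run time, under
   the assumption of byte-sized elements and a power-of-two vector size VS
   (all names below are hypothetical):

     #include <stdint.h>

     static void
     realign_load_model (const unsigned char *addr, int vs,
                         unsigned char *result)
     {
       const unsigned char *msq_base
         = (const unsigned char *) ((uintptr_t) addr & -(uintptr_t) vs);
       const unsigned char *lsq_base = msq_base + vs;  // next aligned vector
       int token = addr - msq_base;                    // realignment amount
       for (int i = 0; i < vs; i++)
         result[i] = (token + i < vs) ? msq_base[token + i]
                                      : lsq_base[token + i - vs];
     }

   MSQ_BASE models the prolog load msq_init (or the in-loop msq for
   dr_explicit_realign), LSQ_BASE the in-loop lsq, and TOKEN plays the role
   of the realignment token returned through REALIGNMENT_TOKEN.  */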
5538 /* Function vect_grouped_load_supported.
5540 COUNT is the size of the load group (the number of statements plus the
5541 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5542 only one statement, with a gap of COUNT - 1.
5544 Returns true if a suitable permute exists. */
5546 bool
5547 vect_grouped_load_supported (tree vectype, bool single_element_p,
5548 unsigned HOST_WIDE_INT count)
5550 machine_mode mode = TYPE_MODE (vectype);
5552 /* If this is single-element interleaving with an element distance
5553 that leaves unused vector loads around, punt - we at least create
5554 very sub-optimal code in that case (and blow up memory,
5555 see PR65518). */
5556 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5558 if (dump_enabled_p ())
5559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5560 "single-element interleaving not supported "
5561 "for not adjacent vector loads\n");
5562 return false;
5565 /* vect_permute_load_chain requires the group size to be equal to 3 or
5566 be a power of two. */
5567 if (count != 3 && exact_log2 (count) == -1)
5569 if (dump_enabled_p ())
5570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5571 "the size of the group of accesses"
5572 " is not a power of 2 or not equal to 3\n");
5573 return false;
5576 /* Check that the permutation is supported. */
5577 if (VECTOR_MODE_P (mode))
5579 unsigned int i, j;
5580 if (count == 3)
5582 unsigned int nelt;
5583 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5585 if (dump_enabled_p ())
5586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5587 "cannot handle groups of 3 loads for"
5588 " variable-length vectors\n");
5589 return false;
5592 vec_perm_builder sel (nelt, nelt, 1);
5593 sel.quick_grow (nelt);
5594 vec_perm_indices indices;
5595 unsigned int k;
5596 for (k = 0; k < 3; k++)
5598 for (i = 0; i < nelt; i++)
5599 if (3 * i + k < 2 * nelt)
5600 sel[i] = 3 * i + k;
5601 else
5602 sel[i] = 0;
5603 indices.new_vector (sel, 2, nelt);
5604 if (!can_vec_perm_const_p (mode, indices))
5606 if (dump_enabled_p ())
5607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5608 "shuffle of 3 loads is not supported by"
5609 " target\n");
5610 return false;
5612 for (i = 0, j = 0; i < nelt; i++)
5613 if (3 * i + k < 2 * nelt)
5614 sel[i] = i;
5615 else
5616 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5617 indices.new_vector (sel, 2, nelt);
5618 if (!can_vec_perm_const_p (mode, indices))
5620 if (dump_enabled_p ())
5621 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5622 "shuffle of 3 loads is not supported by"
5623 " target\n");
5624 return false;
5627 return true;
5629 else
5631 /* If the length is not 3, only powers of 2 are supported. */
5632 gcc_assert (pow2p_hwi (count));
5633 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5635 /* The encoding has a single stepped pattern. */
5636 vec_perm_builder sel (nelt, 1, 3);
5637 sel.quick_grow (3);
5638 for (i = 0; i < 3; i++)
5639 sel[i] = i * 2;
5640 vec_perm_indices indices (sel, 2, nelt);
5641 if (can_vec_perm_const_p (mode, indices))
5643 for (i = 0; i < 3; i++)
5644 sel[i] = i * 2 + 1;
5645 indices.new_vector (sel, 2, nelt);
5646 if (can_vec_perm_const_p (mode, indices))
5647 return true;
5652 if (dump_enabled_p ())
5653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5654 "extract even/odd not supported by target\n");
5655 return false;
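/* Editor's illustration, not part of the original function: a standalone
   sketch that expands the two selectors checked above for a group of three
   loads, given NELT and K (the helper is hypothetical; it mirrors the two
   loops in the count == 3 case):

     static void
     group3_load_sels (int nelt, int k, int *sel_low, int *sel_high)
     {
       int j = 0;
       for (int i = 0; i < nelt; i++)
         sel_low[i] = (3 * i + k < 2 * nelt) ? 3 * i + k : 0;
       for (int i = 0; i < nelt; i++)
         sel_high[i] = (3 * i + k < 2 * nelt)
                       ? i : nelt + ((nelt + k) % 3) + 3 * j++;
     }

   For a 4-element vector and K == 0 this gives {0, 3, 6, 0} and
   {0, 1, 2, 5}; selector values >= NELT pick elements from the second
   operand of the corresponding VEC_PERM_EXPR.  */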
5658 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5659 type VECTYPE. MASKED_P says whether the masked form is needed. */
5661 bool
5662 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5663 bool masked_p)
5665 if (masked_p)
5666 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5667 vec_mask_load_lanes_optab,
5668 vectype, count);
5669 else
5670 return vect_lanes_optab_supported_p ("vec_load_lanes",
5671 vec_load_lanes_optab,
5672 vectype, count);
5675 /* Function vect_permute_load_chain.
5677 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5678 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5679 the input data correctly. Return the final references for loads in
5680 RESULT_CHAIN.
5682 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5683 The input is 4 vectors each containing 8 elements. We assign a number to each
5684 element, the input sequence is:
5686 1st vec: 0 1 2 3 4 5 6 7
5687 2nd vec: 8 9 10 11 12 13 14 15
5688 3rd vec: 16 17 18 19 20 21 22 23
5689 4th vec: 24 25 26 27 28 29 30 31
5691 The output sequence should be:
5693 1st vec: 0 4 8 12 16 20 24 28
5694 2nd vec: 1 5 9 13 17 21 25 29
5695 3rd vec: 2 6 10 14 18 22 26 30
5696 4th vec: 3 7 11 15 19 23 27 31
5698 i.e., the first output vector should contain the first elements of each
5699 interleaving group, etc.
5701 We use extract_even/odd instructions to create such output. The input of
5702 each extract_even/odd operation is two vectors
5703 1st vec 2nd vec
5704 0 1 2 3 4 5 6 7
5706 and the output is the vector of extracted even/odd elements. The output of
5707 extract_even will be: 0 2 4 6
5708 and of extract_odd: 1 3 5 7
5711 The permutation is done in log LENGTH stages. In each stage extract_even
5712 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5713 their order. In our example,
5715 E1: extract_even (1st vec, 2nd vec)
5716 E2: extract_odd (1st vec, 2nd vec)
5717 E3: extract_even (3rd vec, 4th vec)
5718 E4: extract_odd (3rd vec, 4th vec)
5720 The output for the first stage will be:
5722 E1: 0 2 4 6 8 10 12 14
5723 E2: 1 3 5 7 9 11 13 15
5724 E3: 16 18 20 22 24 26 28 30
5725 E4: 17 19 21 23 25 27 29 31
5727 In order to proceed and create the correct sequence for the next stage (or
5728 for the correct output, if the second stage is the last one, as in our
5729 example), we first put the output of extract_even operation and then the
5730 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5731 The input for the second stage is:
5733 1st vec (E1): 0 2 4 6 8 10 12 14
5734 2nd vec (E3): 16 18 20 22 24 26 28 30
5735 3rd vec (E2): 1 3 5 7 9 11 13 15
5736 4th vec (E4): 17 19 21 23 25 27 29 31
5738 The output of the second stage:
5740 E1: 0 4 8 12 16 20 24 28
5741 E2: 2 6 10 14 18 22 26 30
5742 E3: 1 5 9 13 17 21 25 29
5743 E4: 3 7 11 15 19 23 27 31
5745 And RESULT_CHAIN after reordering:
5747 1st vec (E1): 0 4 8 12 16 20 24 28
5748 2nd vec (E3): 1 5 9 13 17 21 25 29
5749 3rd vec (E2): 2 6 10 14 18 22 26 30
5750 4th vec (E4): 3 7 11 15 19 23 27 31. */
5752 static void
5753 vect_permute_load_chain (vec<tree> dr_chain,
5754 unsigned int length,
5755 stmt_vec_info stmt_info,
5756 gimple_stmt_iterator *gsi,
5757 vec<tree> *result_chain)
5759 tree data_ref, first_vect, second_vect;
5760 tree perm_mask_even, perm_mask_odd;
5761 tree perm3_mask_low, perm3_mask_high;
5762 gimple *perm_stmt;
5763 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5764 unsigned int i, j, log_length = exact_log2 (length);
5766 result_chain->quick_grow (length);
5767 memcpy (result_chain->address (), dr_chain.address (),
5768 length * sizeof (tree));
5770 if (length == 3)
5772 /* vect_grouped_load_supported ensures that this is constant. */
5773 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5774 unsigned int k;
5776 vec_perm_builder sel (nelt, nelt, 1);
5777 sel.quick_grow (nelt);
5778 vec_perm_indices indices;
5779 for (k = 0; k < 3; k++)
5781 for (i = 0; i < nelt; i++)
5782 if (3 * i + k < 2 * nelt)
5783 sel[i] = 3 * i + k;
5784 else
5785 sel[i] = 0;
5786 indices.new_vector (sel, 2, nelt);
5787 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5789 for (i = 0, j = 0; i < nelt; i++)
5790 if (3 * i + k < 2 * nelt)
5791 sel[i] = i;
5792 else
5793 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5794 indices.new_vector (sel, 2, nelt);
5795 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5797 first_vect = dr_chain[0];
5798 second_vect = dr_chain[1];
5800 /* Create interleaving stmt (low part of):
5801 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5802 ...}> */
5803 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5804 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5805 second_vect, perm3_mask_low);
5806 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5808 /* Create interleaving stmt (high part of):
5809 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5810 ...}> */
5811 first_vect = data_ref;
5812 second_vect = dr_chain[2];
5813 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5814 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5815 second_vect, perm3_mask_high);
5816 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5817 (*result_chain)[k] = data_ref;
5820 else
5822 /* If the length is not 3, only powers of 2 are supported. */
5823 gcc_assert (pow2p_hwi (length));
5825 /* The encoding has a single stepped pattern. */
5826 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5827 vec_perm_builder sel (nelt, 1, 3);
5828 sel.quick_grow (3);
5829 for (i = 0; i < 3; ++i)
5830 sel[i] = i * 2;
5831 vec_perm_indices indices (sel, 2, nelt);
5832 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5834 for (i = 0; i < 3; ++i)
5835 sel[i] = i * 2 + 1;
5836 indices.new_vector (sel, 2, nelt);
5837 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5839 for (i = 0; i < log_length; i++)
5841 for (j = 0; j < length; j += 2)
5843 first_vect = dr_chain[j];
5844 second_vect = dr_chain[j+1];
5846 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5847 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5848 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5849 first_vect, second_vect,
5850 perm_mask_even);
5851 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5852 (*result_chain)[j/2] = data_ref;
5854 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5855 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5856 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5857 first_vect, second_vect,
5858 perm_mask_odd);
5859 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5860 (*result_chain)[j/2+length/2] = data_ref;
5862 memcpy (dr_chain.address (), result_chain->address (),
5863 length * sizeof (tree));
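/* Editor's illustration, not part of the original pass: a scalar model of
   one extract-even/extract-odd stage, matching the two masks built above
   (plain int elements; the helper name is hypothetical):

     static void
     extract_even_odd_step (const int *v1, const int *v2, int nelt,
                            int *even, int *odd)
     {
       for (int i = 0; i < nelt; i++)
         {
           // Selector 2*i (resp. 2*i + 1) indexes the concatenation v1 ++ v2.
           even[i] = (2 * i < nelt) ? v1[2 * i] : v2[2 * i - nelt];
           odd[i] = (2 * i + 1 < nelt) ? v1[2 * i + 1] : v2[2 * i + 1 - nelt];
         }
     }

   Running log2 (LENGTH) such stages over the pairs in DR_CHAIN, storing the
   even results in the first half of RESULT_CHAIN and the odd results in the
   second half, reproduces the reordering worked through in the comment
   before this function.  */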
5868 /* Function vect_shift_permute_load_chain.
5870 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5871 sequence of stmts to reorder the input data accordingly.
5872 Return the final references for loads in RESULT_CHAIN.
5873 Return true if successful, false otherwise.
5875 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5876 The input is 3 vectors each containing 8 elements. We assign a
5877 number to each element, the input sequence is:
5879 1st vec: 0 1 2 3 4 5 6 7
5880 2nd vec: 8 9 10 11 12 13 14 15
5881 3rd vec: 16 17 18 19 20 21 22 23
5883 The output sequence should be:
5885 1st vec: 0 3 6 9 12 15 18 21
5886 2nd vec: 1 4 7 10 13 16 19 22
5887 3rd vec: 2 5 8 11 14 17 20 23
5889 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5891 First we shuffle all 3 vectors to get correct elements order:
5893 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5894 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5895 3rd vec: (16 19 22) (17 20 23) (18 21)
5897 Next we concatenate and shift the vectors 3 times:
5899 1st step:
5900 shift right by 6 the concatenation of:
5901 "1st vec" and "2nd vec"
5902 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5903 "2nd vec" and "3rd vec"
5904 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5905 "3rd vec" and "1st vec"
5906 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5907 | New vectors |
5909 So that now new vectors are:
5911 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5912 2nd vec: (10 13) (16 19 22) (17 20 23)
5913 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5915 2nd step:
5916 shift right by 5 the concatenation of:
5917 "1st vec" and "3rd vec"
5918 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5919 "2nd vec" and "1st vec"
5920 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5921 "3rd vec" and "2nd vec"
5922 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5923 | New vectors |
5925 So that now new vectors are:
5927 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5928 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5929 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5931 3rd step:
5932 shift right by 5 the concatenation of:
5933 "1st vec" and "1st vec"
5934 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5935 shift right by 3 the concatenation of:
5936 "2nd vec" and "2nd vec"
5937 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5938 | New vectors |
5940 So that now all vectors are READY:
5941 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5942 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5943 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5945 This algorithm is faster than the one in vect_permute_load_chain if:
5946 1. "shift of a concatenation" is faster than a general permutation.
5947 This is usually the case.
5948 2. The TARGET machine cannot execute vector instructions in parallel.
5949 This matters because each step of this algorithm depends on the previous
5950 one, while the algorithm in vect_permute_load_chain is much more parallel.
5952 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5955 static bool
5956 vect_shift_permute_load_chain (vec<tree> dr_chain,
5957 unsigned int length,
5958 stmt_vec_info stmt_info,
5959 gimple_stmt_iterator *gsi,
5960 vec<tree> *result_chain)
5962 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5963 tree perm2_mask1, perm2_mask2, perm3_mask;
5964 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5965 gimple *perm_stmt;
5967 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5968 unsigned int i;
5969 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5971 unsigned HOST_WIDE_INT nelt, vf;
5972 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
5973 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
5974 /* Not supported for variable-length vectors. */
5975 return false;
5977 vec_perm_builder sel (nelt, nelt, 1);
5978 sel.quick_grow (nelt);
5980 result_chain->quick_grow (length);
5981 memcpy (result_chain->address (), dr_chain.address (),
5982 length * sizeof (tree));
5984 if (pow2p_hwi (length) && vf > 4)
5986 unsigned int j, log_length = exact_log2 (length);
5987 for (i = 0; i < nelt / 2; ++i)
5988 sel[i] = i * 2;
5989 for (i = 0; i < nelt / 2; ++i)
5990 sel[nelt / 2 + i] = i * 2 + 1;
5991 vec_perm_indices indices (sel, 2, nelt);
5992 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5994 if (dump_enabled_p ())
5995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5996 "shuffle of 2 fields structure is not \
5997 supported by target\n");
5998 return false;
6000 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6002 for (i = 0; i < nelt / 2; ++i)
6003 sel[i] = i * 2 + 1;
6004 for (i = 0; i < nelt / 2; ++i)
6005 sel[nelt / 2 + i] = i * 2;
6006 indices.new_vector (sel, 2, nelt);
6007 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6009 if (dump_enabled_p ())
6010 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6011 "shuffle of 2 fields structure is not \
6012 supported by target\n");
6013 return false;
6015 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6017 /* Generating permutation constant to shift all elements.
6018 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6019 for (i = 0; i < nelt; i++)
6020 sel[i] = nelt / 2 + i;
6021 indices.new_vector (sel, 2, nelt);
6022 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6024 if (dump_enabled_p ())
6025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6026 "shift permutation is not supported by target\n");
6027 return false;
6029 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6031 /* Generating permutation constant to select vector from 2.
6032 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6033 for (i = 0; i < nelt / 2; i++)
6034 sel[i] = i;
6035 for (i = nelt / 2; i < nelt; i++)
6036 sel[i] = nelt + i;
6037 indices.new_vector (sel, 2, nelt);
6038 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6040 if (dump_enabled_p ())
6041 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6042 "select is not supported by target\n");
6043 return false;
6045 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6047 for (i = 0; i < log_length; i++)
6049 for (j = 0; j < length; j += 2)
6051 first_vect = dr_chain[j];
6052 second_vect = dr_chain[j + 1];
6054 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6055 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6056 first_vect, first_vect,
6057 perm2_mask1);
6058 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6059 vect[0] = data_ref;
6061 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6062 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6063 second_vect, second_vect,
6064 perm2_mask2);
6065 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6066 vect[1] = data_ref;
6068 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6069 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6070 vect[0], vect[1], shift1_mask);
6071 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6072 (*result_chain)[j/2 + length/2] = data_ref;
6074 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6075 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6076 vect[0], vect[1], select_mask);
6077 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6078 (*result_chain)[j/2] = data_ref;
6080 memcpy (dr_chain.address (), result_chain->address (),
6081 length * sizeof (tree));
6083 return true;
6085 if (length == 3 && vf > 2)
6087 unsigned int k = 0, l = 0;
6089 /* Generating permutation constant to get all elements in right order.
6090 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6091 for (i = 0; i < nelt; i++)
6093 if (3 * k + (l % 3) >= nelt)
6095 k = 0;
6096 l += (3 - (nelt % 3));
6098 sel[i] = 3 * k + (l % 3);
6099 k++;
6101 vec_perm_indices indices (sel, 2, nelt);
6102 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6104 if (dump_enabled_p ())
6105 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6106 "shuffle of 3 fields structure is not \
6107 supported by target\n");
6108 return false;
6110 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6112 /* Generating permutation constant to shift all elements.
6113 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6114 for (i = 0; i < nelt; i++)
6115 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6116 indices.new_vector (sel, 2, nelt);
6117 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6119 if (dump_enabled_p ())
6120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6121 "shift permutation is not supported by target\n");
6122 return false;
6124 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6126 /* Generating permutation constant to shift all elements.
6127 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6128 for (i = 0; i < nelt; i++)
6129 sel[i] = 2 * (nelt / 3) + 1 + i;
6130 indices.new_vector (sel, 2, nelt);
6131 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6133 if (dump_enabled_p ())
6134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6135 "shift permutation is not supported by target\n");
6136 return false;
6138 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6140 /* Generating permutation constant to shift all elements.
6141 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6142 for (i = 0; i < nelt; i++)
6143 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6144 indices.new_vector (sel, 2, nelt);
6145 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6147 if (dump_enabled_p ())
6148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6149 "shift permutation is not supported by target\n");
6150 return false;
6152 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6154 /* Generating permutation constant to shift all elements.
6155 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6156 for (i = 0; i < nelt; i++)
6157 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6158 indices.new_vector (sel, 2, nelt);
6159 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6161 if (dump_enabled_p ())
6162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6163 "shift permutation is not supported by target\n");
6164 return false;
6166 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6168 for (k = 0; k < 3; k++)
6170 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6171 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6172 dr_chain[k], dr_chain[k],
6173 perm3_mask);
6174 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6175 vect[k] = data_ref;
6178 for (k = 0; k < 3; k++)
6180 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6181 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6182 vect[k % 3], vect[(k + 1) % 3],
6183 shift1_mask);
6184 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6185 vect_shift[k] = data_ref;
6188 for (k = 0; k < 3; k++)
6190 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6191 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6192 vect_shift[(4 - k) % 3],
6193 vect_shift[(3 - k) % 3],
6194 shift2_mask);
6195 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6196 vect[k] = data_ref;
6199 (*result_chain)[3 - (nelt % 3)] = vect[2];
6201 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6202 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6203 vect[0], shift3_mask);
6204 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6205 (*result_chain)[nelt % 3] = data_ref;
6207 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6208 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6209 vect[1], shift4_mask);
6210 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6211 (*result_chain)[0] = data_ref;
6212 return true;
6214 return false;
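/* Editor's illustration, not part of the original pass: both branches above
   lean on a "shift of a concatenation", i.e. a VEC_PERM_EXPR whose selector
   is {S, S+1, ..., S+NELT-1} over the 2*NELT-element concatenation of its
   operands.  A scalar model, assuming plain int elements (the helper name
   is hypothetical):

     static void
     shift_concat (const int *v1, const int *v2, int nelt, int shift,
                   int *result)
     {
       for (int i = 0; i < nelt; i++)
         {
           int s = shift + i;   // index into the concatenation v1 ++ v2
           result[i] = (s < nelt) ? v1[s] : v2[s - nelt];
         }
     }

   In the length == 3 branch, shift1_mask corresponds to
   SHIFT == 2 * (nelt / 3) + (nelt % 3) (6 for an 8-element vector) and
   shift2_mask to SHIFT == 2 * (nelt / 3) + 1 (5 for 8 elements), matching
   the "shift right by 6" and "shift right by 5" steps in the comment
   before this function.  */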
6217 /* Function vect_transform_grouped_load.
6219 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6220 to perform their permutation and assign the resulting vectorized statements to
6221 the scalar statements.
6224 void
6225 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6226 int size, gimple_stmt_iterator *gsi)
6228 machine_mode mode;
6229 vec<tree> result_chain = vNULL;
6231 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6232 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6233 vectors that are ready for vector computation. */
6234 result_chain.create (size);
6236 /* If the reassociation width for the vector type is 2 or greater, the target
6237 machine can execute 2 or more vector instructions in parallel. Otherwise
6238 try to get the chain for the load group using vect_shift_permute_load_chain. */
6239 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6240 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6241 || pow2p_hwi (size)
6242 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6243 gsi, &result_chain))
6244 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6245 vect_record_grouped_load_vectors (stmt_info, result_chain);
6246 result_chain.release ();
6249 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6250 generated as part of the vectorization of STMT_INFO. Assign the statement
6251 for each vector to the associated scalar statement. */
6253 void
6254 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6255 vec<tree> result_chain)
6257 vec_info *vinfo = stmt_info->vinfo;
6258 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6259 unsigned int i, gap_count;
6260 tree tmp_data_ref;
6262 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6263 Since we scan the chain starting from its first node, their order
6264 corresponds to the order of data-refs in RESULT_CHAIN. */
6265 stmt_vec_info next_stmt_info = first_stmt_info;
6266 gap_count = 1;
6267 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6269 if (!next_stmt_info)
6270 break;
6272 /* Skip the gaps. Loads created for the gaps will be removed by dead
6273 code elimination pass later. No need to check for the first stmt in
6274 the group, since it always exists.
6275 DR_GROUP_GAP is the number of steps in elements from the previous
6276 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6277 correspond to the gaps. */
6278 if (next_stmt_info != first_stmt_info
6279 && gap_count < DR_GROUP_GAP (next_stmt_info))
6281 gap_count++;
6282 continue;
6285 while (next_stmt_info)
6287 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6288 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6289 copies, and we put the new vector statement in the first available
6290 RELATED_STMT. */
6291 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6292 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6293 else
6295 if (!DR_GROUP_SAME_DR_STMT (next_stmt_info))
6297 stmt_vec_info prev_stmt_info
6298 = STMT_VINFO_VEC_STMT (next_stmt_info);
6299 stmt_vec_info rel_stmt_info
6300 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6301 while (rel_stmt_info)
6303 prev_stmt_info = rel_stmt_info;
6304 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6307 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6311 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6312 gap_count = 1;
6313 /* If NEXT_STMT_INFO accesses the same DR as the previous statement,
6314 put the same TMP_DATA_REF as its vectorized statement; otherwise
6315 get the next data-ref from RESULT_CHAIN. */
6316 if (!next_stmt_info || !DR_GROUP_SAME_DR_STMT (next_stmt_info))
6317 break;
6322 /* Function vect_can_force_dr_alignment_p.
6324 Returns whether the alignment of a DECL can be forced to be aligned
6325 on an ALIGNMENT-bit boundary. */
6327 bool
6328 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6330 if (!VAR_P (decl))
6331 return false;
6333 if (decl_in_symtab_p (decl)
6334 && !symtab_node::get (decl)->can_increase_alignment_p ())
6335 return false;
6337 if (TREE_STATIC (decl))
6338 return (known_le (alignment,
6339 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6340 else
6341 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6345 /* Return whether the data reference DR_INFO is supported with respect to its
6346 alignment.
6347 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6348 if it is aligned, i.e., check if it is possible to vectorize it with a different
6349 alignment. */
6351 enum dr_alignment_support
6352 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6353 bool check_aligned_accesses)
6355 data_reference *dr = dr_info->dr;
6356 stmt_vec_info stmt_info = dr_info->stmt;
6357 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6358 machine_mode mode = TYPE_MODE (vectype);
6359 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6360 struct loop *vect_loop = NULL;
6361 bool nested_in_vect_loop = false;
6363 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6364 return dr_aligned;
6366 /* For now assume all conditional loads/stores support unaligned
6367 access without any special code. */
6368 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6369 if (gimple_call_internal_p (stmt)
6370 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6371 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6372 return dr_unaligned_supported;
6374 if (loop_vinfo)
6376 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6377 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6380 /* Possibly unaligned access. */
6382 /* We can choose between using the implicit realignment scheme (generating
6383 a misaligned_move stmt) and the explicit realignment scheme (generating
6384 aligned loads with a REALIGN_LOAD). There are two variants to the
6385 explicit realignment scheme: optimized, and unoptimized.
6386 We can optimize the realignment only if the step between consecutive
6387 vector loads is equal to the vector size. Since the vector memory
6388 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6389 is guaranteed that the misalignment amount remains the same throughout the
6390 execution of the vectorized loop. Therefore, we can create the
6391 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6392 at the loop preheader.
6394 However, in the case of outer-loop vectorization, when vectorizing a
6395 memory access in the inner-loop nested within the LOOP that is now being
6396 vectorized, while it is guaranteed that the misalignment of the
6397 vectorized memory access will remain the same in different outer-loop
6398 iterations, it is *not* guaranteed that it will remain the same throughout
6399 the execution of the inner-loop. This is because the inner-loop advances
6400 with the original scalar step (and not in steps of VS). If the inner-loop
6401 step happens to be a multiple of VS, then the misalignment remains fixed
6402 and we can use the optimized realignment scheme. For example:
6404 for (i=0; i<N; i++)
6405 for (j=0; j<M; j++)
6406 s += a[i+j];
6408 When vectorizing the i-loop in the above example, the step between
6409 consecutive vector loads is 1, and so the misalignment does not remain
6410 fixed across the execution of the inner-loop, and the realignment cannot
6411 be optimized (as illustrated in the following pseudo vectorized loop):
6413 for (i=0; i<N; i+=4)
6414 for (j=0; j<M; j++){
6415 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6416 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6417 // (assuming that we start from an aligned address).
6420 We therefore have to use the unoptimized realignment scheme:
6422 for (i=0; i<N; i+=4)
6423 for (j=k; j<M; j+=4)
6424 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6425 // that the misalignment of the initial address is
6426 // 0).
6428 The loop can then be vectorized as follows:
6430 for (k=0; k<4; k++){
6431 rt = get_realignment_token (&vp[k]);
6432 for (i=0; i<N; i+=4){
6433 v1 = vp[i+k];
6434 for (j=k; j<M; j+=4){
6435 v2 = vp[i+j+VS-1];
6436 va = REALIGN_LOAD <v1,v2,rt>;
6437 vs += va;
6438 v1 = v2;
6441 } */
6443 if (DR_IS_READ (dr))
6445 bool is_packed = false;
6446 tree type = (TREE_TYPE (DR_REF (dr)));
6448 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6449 && (!targetm.vectorize.builtin_mask_for_load
6450 || targetm.vectorize.builtin_mask_for_load ()))
6452 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6454 /* If we are doing SLP then the accesses need not have the
6455 same alignment, instead it depends on the SLP group size. */
6456 if (loop_vinfo
6457 && STMT_SLP_TYPE (stmt_info)
6458 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6459 * (DR_GROUP_SIZE
6460 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6461 TYPE_VECTOR_SUBPARTS (vectype)))
6463 else if (!loop_vinfo
6464 || (nested_in_vect_loop
6465 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6466 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6467 return dr_explicit_realign;
6468 else
6469 return dr_explicit_realign_optimized;
6471 if (!known_alignment_for_access_p (dr_info))
6472 is_packed = not_size_aligned (DR_REF (dr));
6474 if (targetm.vectorize.support_vector_misalignment
6475 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6476 /* Can't software pipeline the loads, but can at least do them. */
6477 return dr_unaligned_supported;
6479 else
6481 bool is_packed = false;
6482 tree type = (TREE_TYPE (DR_REF (dr)));
6484 if (!known_alignment_for_access_p (dr_info))
6485 is_packed = not_size_aligned (DR_REF (dr));
6487 if (targetm.vectorize.support_vector_misalignment
6488 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6489 return dr_unaligned_supported;
6492 /* Unsupported. */
6493 return dr_unaligned_unsupported;
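/* Editor's illustration, not a quotation of the actual callers: code that
   uses this function typically branches on the returned classification
   along these lines (a sketch only):

     enum dr_alignment_support scheme
       = vect_supportable_dr_alignment (dr_info, false);
     switch (scheme)
       {
       case dr_aligned:
       case dr_unaligned_supported:
         // Emit an ordinary (possibly misaligned) vector access.
         break;
       case dr_explicit_realign:
       case dr_explicit_realign_optimized:
         // Loads only: emit REALIGN_LOAD, using vect_setup_realignment
         // to create the prolog code and realignment token.
         break;
       case dr_unaligned_unsupported:
       default:
         // Give up on vectorizing this data reference.
         break;
       }
   */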