gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2019 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "params.h"
  53 #include "tree-cfg.h"
  54 #include "tree-hash-traits.h"
  55 #include "vec-perm-indices.h"
  56 #include "internal-fn.h"
  57
  58 /* Return true if load- or store-lanes optab OPTAB is implemented for
  59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  60
  61 static bool
  62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  63                               tree vectype, unsigned HOST_WIDE_INT count)
  64 {
  65   machine_mode mode, array_mode;
  66   bool limit_p;
  67
  68   mode = TYPE_MODE (vectype);
  69   if (!targetm.array_mode (mode, count).exists (&array_mode))
  70     {
  71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
  72       limit_p = !targetm.array_mode_supported_p (mode, count);
  73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
  74         {
  75           if (dump_enabled_p ())
  76             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  77                              "no array mode for %s[%wu]\n",
  78                              GET_MODE_NAME (mode), count);
  79           return false;
  80         }
  81     }
  82
  83   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  84     {
  85       if (dump_enabled_p ())
  86         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  87                          "cannot use %s<%s><%s>\n", name,
  88                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  89       return false;
  90     }
  91
  92   if (dump_enabled_p ())
  93     dump_printf_loc (MSG_NOTE, vect_location,
  94                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  95                      GET_MODE_NAME (mode));
  96
  97   return true;
  98 }
  99
 100
 101 /* Return the smallest scalar part of STMT_INFO.
 102    This is used to determine the vectype of the stmt.  We generally set the
 103    vectype according to the type of the result (lhs).  For stmts whose
 104    result-type is different than the type of the arguments (e.g., demotion,
 105    promotion), vectype will be reset appropriately (later).  Note that we have
 106    to visit the smallest datatype in this function, because that determines the
 107    VF.  If the smallest datatype in the loop is present only as the rhs of a
 108    promotion operation - we'd miss it.
 109    Such a case, where a variable of this datatype does not appear in the lhs
 110    anywhere in the loop, can only occur if it's an invariant: e.g.:
 111    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 112    invariant motion.  However, we cannot rely on invariant motion to always
 113    take invariants out of the loop, and so in the case of promotion we also
 114    have to check the rhs.
 115    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 116    types.  */
 117
 118 tree
 119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
 120                                HOST_WIDE_INT *lhs_size_unit,
 121                                HOST_WIDE_INT *rhs_size_unit)
 122 {
 123   tree scalar_type = gimple_expr_type (stmt_info->stmt);
 124   HOST_WIDE_INT lhs, rhs;
 125
 126   /* During the analysis phase, this function is called on arbitrary
 127      statements that might not have scalar results.  */
 128   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 129     return scalar_type;
 130
 131   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 132
 133   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
 134   if (assign
 135       && (gimple_assign_cast_p (assign)
 136           || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
 137           || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
 138           || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
 139           || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
 140           || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
 141     {
 142       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
 143
 144       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 145       if (rhs < lhs)
 146         scalar_type = rhs_type;
 147     }
 148
 149   *lhs_size_unit = lhs;
 150   *rhs_size_unit = rhs;
 151   return scalar_type;
 152 }
 153
 154
 155 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 156    tested at run-time.  Return TRUE if DDR was successfully inserted.
 157    Return false if versioning is not supported.  */
 158
 159 static opt_result
 160 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 161 {
 162   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 163
 164   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 165     return opt_result::failure_at (vect_location,
 166                                    "will not create alias checks, as"
 167                                    " --param vect-max-version-for-alias-checks"
 168                                    " == 0\n");
 169
 170   opt_result res
 171     = runtime_alias_check_p (ddr, loop,
 172                              optimize_loop_nest_for_speed_p (loop));
 173   if (!res)
 174     return res;
 175
 176   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 177   return opt_result::success ();
 178 }
 179
 180 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
 181
 182 static void
 183 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
 184 {
 185   vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 186   for (unsigned int i = 0; i < checks.length(); ++i)
 187     if (checks[i] == value)
 188       return;
 189
 190   if (dump_enabled_p ())
 191     dump_printf_loc (MSG_NOTE, vect_location,
 192                      "need run-time check that %T is nonzero\n",
 193                      value);
 194   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
 195 }
 196
 197 /* Return true if we know that the order of vectorized DR_INFO_A and
 198    vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
 199    DR_INFO_B.  At least one of the accesses is a write.  */
 200
 201 static bool
 202 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
 203 {
 204   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
 205   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
 206
 207   /* Single statements are always kept in their original order.  */
 208   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 209       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 210     return true;
 211
 212   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
 213      group are emitted at the position of the last scalar load and all
 214      stores in a group are emitted at the position of the last scalar store.
 215      Compute that position and check whether the resulting order matches
 216      the current one.  */
 217   stmt_vec_info last_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
 218   if (last_a)
 219     for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
 220          s = DR_GROUP_NEXT_ELEMENT (s))
 221       last_a = get_later_stmt (last_a, s);
 222   else
 223     last_a = stmtinfo_a;
 224   stmt_vec_info last_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
 225   if (last_b)
 226     for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
 227          s = DR_GROUP_NEXT_ELEMENT (s))
 228       last_b = get_later_stmt (last_b, s);
 229   else
 230     last_b = stmtinfo_b;
 231   return ((get_later_stmt (last_a, last_b) == last_a)
 232           == (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a));
 233 }
 234
 235 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 236    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 237    distances.  These distances are conservatively correct but they don't
 238    reflect a guaranteed dependence.
 239
 240    Return true if this function does all the work necessary to avoid
 241    an alias or false if the caller should use the dependence distances
 242    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 243    the depth of the loop described by LOOP_VINFO and the other arguments
 244    are as for vect_analyze_data_ref_dependence.  */
 245
 246 static bool
 247 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 248                                        loop_vec_info loop_vinfo,
 249                                        int loop_depth, unsigned int *max_vf)
 250 {
 251   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 252   lambda_vector dist_v;
 253   unsigned int i;
 254   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 255     {
 256       int dist = dist_v[loop_depth];
 257       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 258         {
 259           /* If the user asserted safelen >= DIST consecutive iterations
 260              can be executed concurrently, assume independence.
 261
 262              ??? An alternative would be to add the alias check even
 263              in this case, and vectorize the fallback loop with the
 264              maximum VF set to safelen.  However, if the user has
 265              explicitly given a length, it's less likely that that
 266              would be a win.  */
 267           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 268             {
 269               if ((unsigned int) loop->safelen < *max_vf)
 270                 *max_vf = loop->safelen;
 271               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 272               continue;
 273             }
 274
 275           /* For dependence distances of 2 or more, we have the option
 276              of limiting VF or checking for an alias at runtime.
 277              Prefer to check at runtime if we can, to avoid limiting
 278              the VF unnecessarily when the bases are in fact independent.
 279
 280              Note that the alias checks will be removed if the VF ends up
 281              being small enough.  */
 282           dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
 283           dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
 284           return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
 285                   && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
 286                   && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
 287         }
 288     }
 289   return true;
 290 }
 291
 292
 293 /* Function vect_analyze_data_ref_dependence.
 294
 295    FIXME: I needed to change the sense of the returned flag.
 296
 297    Return FALSE if there (might) exist a dependence between a memory-reference
 298    DRA and a memory-reference DRB.  When versioning for alias may check a
 299    dependence at run-time, return TRUE.  Adjust *MAX_VF according to
 300    the data dependence.  */
 301
 302 static opt_result
 303 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 304                                   loop_vec_info loop_vinfo,
 305                                   unsigned int *max_vf)
 306 {
 307   unsigned int i;
 308   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 309   struct data_reference *dra = DDR_A (ddr);
 310   struct data_reference *drb = DDR_B (ddr);
 311   dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
 312   dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
 313   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
 314   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
 315   lambda_vector dist_v;
 316   unsigned int loop_depth;
 317
 318   /* In loop analysis all data references should be vectorizable.  */
 319   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 320       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 321     gcc_unreachable ();
 322
 323   /* Independent data accesses.  */
 324   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 325     return opt_result::success ();
 326
 327   if (dra == drb
 328       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 329     return opt_result::success ();
 330
 331   /* We do not have to consider dependences between accesses that belong
 332      to the same group, unless the stride could be smaller than the
 333      group size.  */
 334   if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 335       && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 336           == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
 337       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
 338     return opt_result::success ();
 339
 340   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 341      least two scalar iterations, there is always also a true dependence.
 342      As the vectorizer does not re-order loads and stores we can ignore
 343      the anti-dependence if TBAA can disambiguate both DRs similar to the
 344      case with known negative distance anti-dependences (positive
 345      distance anti-dependences would violate TBAA constraints).  */
 346   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 347        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 348       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 349                                  get_alias_set (DR_REF (drb))))
 350     return opt_result::success ();
 351
 352   /* Unknown data dependence.  */
 353   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 354     {
 355       /* If user asserted safelen consecutive iterations can be
 356          executed concurrently, assume independence.  */
 357       if (loop->safelen >= 2)
 358         {
 359           if ((unsigned int) loop->safelen < *max_vf)
 360             *max_vf = loop->safelen;
 361           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 362           return opt_result::success ();
 363         }
 364
 365       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 366           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 367         return opt_result::failure_at
 368           (stmtinfo_a->stmt,
 369            "versioning for alias not supported for: "
 370            "can't determine dependence between %T and %T\n",
 371            DR_REF (dra), DR_REF (drb));
 372
 373       if (dump_enabled_p ())
 374         dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
 375                          "versioning for alias required: "
 376                          "can't determine dependence between %T and %T\n",
 377                          DR_REF (dra), DR_REF (drb));
 378
 379       /* Add to list of ddrs that need to be tested at run-time.  */
 380       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 381     }
 382
 383   /* Known data dependence.  */
 384   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 385     {
 386       /* If user asserted safelen consecutive iterations can be
 387          executed concurrently, assume independence.  */
 388       if (loop->safelen >= 2)
 389         {
 390           if ((unsigned int) loop->safelen < *max_vf)
 391             *max_vf = loop->safelen;
 392           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 393           return opt_result::success ();
 394         }
 395
 396       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 397           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 398         return opt_result::failure_at
 399           (stmtinfo_a->stmt,
 400            "versioning for alias not supported for: "
 401            "bad dist vector for %T and %T\n",
 402            DR_REF (dra), DR_REF (drb));
 403
 404       if (dump_enabled_p ())
 405         dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
 406                          "versioning for alias required: "
 407                          "bad dist vector for %T and %T\n",
 408                          DR_REF (dra), DR_REF (drb));
 409       /* Add to list of ddrs that need to be tested at run-time.  */
 410       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 411     }
 412
 413   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 414
 415   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 416       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 417                                                 loop_depth, max_vf))
 418     return opt_result::success ();
 419
 420   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 421     {
 422       int dist = dist_v[loop_depth];
 423
 424       if (dump_enabled_p ())
 425         dump_printf_loc (MSG_NOTE, vect_location,
 426                          "dependence distance  = %d.\n", dist);
 427
 428       if (dist == 0)
 429         {
 430           if (dump_enabled_p ())
 431             dump_printf_loc (MSG_NOTE, vect_location,
 432                              "dependence distance == 0 between %T and %T\n",
 433                              DR_REF (dra), DR_REF (drb));
 434
 435           /* When we perform grouped accesses and perform implicit CSE
 436              by detecting equal accesses and doing disambiguation with
 437              runtime alias tests like for
 438                 .. = a[i];
 439                 .. = a[i+1];
 440                 a[i] = ..;
 441                 a[i+1] = ..;
 442                 *p = ..;
 443                 .. = a[i];
 444                 .. = a[i+1];
 445              where we will end up loading { a[i], a[i+1] } once, make
 446              sure that inserting group loads before the first load and
 447              stores after the last store will do the right thing.
 448              Similar for groups like
 449                 a[i] = ...;
 450                 ... = a[i];
 451                 a[i+1] = ...;
 452              where loads from the group interleave with the store.  */
 453           if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
 454             return opt_result::failure_at (stmtinfo_a->stmt,
 455                                            "READ_WRITE dependence"
 456                                            " in interleaving.\n");
 457
 458           if (loop->safelen < 2)
 459             {
 460               tree indicator = dr_zero_step_indicator (dra);
 461               if (!indicator || integer_zerop (indicator))
 462                 return opt_result::failure_at (stmtinfo_a->stmt,
 463                                                "access also has a zero step\n");
 464               else if (TREE_CODE (indicator) != INTEGER_CST)
 465                 vect_check_nonzero_value (loop_vinfo, indicator);
 466             }
 467           continue;
 468         }
 469
 470       if (dist > 0 && DDR_REVERSED_P (ddr))
 471         {
 472           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 473              reversed (to make distance vector positive), and the actual
 474              distance is negative.  */
 475           if (dump_enabled_p ())
 476             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 477                              "dependence distance negative.\n");
 478           /* Record a negative dependence distance to later limit the
 479              amount of stmt copying / unrolling we can perform.
 480              Only need to handle read-after-write dependence.  */
 481           if (DR_IS_READ (drb)
 482               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 483                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 484             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 485           continue;
 486         }
 487
 488       unsigned int abs_dist = abs (dist);
 489       if (abs_dist >= 2 && abs_dist < *max_vf)
 490         {
 491           /* The dependence distance requires reduction of the maximal
 492              vectorization factor.  */
 493           *max_vf = abs (dist);
 494           if (dump_enabled_p ())
 495             dump_printf_loc (MSG_NOTE, vect_location,
 496                              "adjusting maximal vectorization factor to %i\n",
 497                              *max_vf);
 498         }
 499
 500       if (abs_dist >= *max_vf)
 501         {
 502           /* Dependence distance does not create dependence, as far as
 503              vectorization is concerned, in this case.  */
 504           if (dump_enabled_p ())
 505             dump_printf_loc (MSG_NOTE, vect_location,
 506                              "dependence distance >= VF.\n");
 507           continue;
 508         }
 509
 510       return opt_result::failure_at (stmtinfo_a->stmt,
 511                                      "not vectorized, possible dependence "
 512                                      "between data-refs %T and %T\n",
 513                                      DR_REF (dra), DR_REF (drb));
 514     }
 515
 516   return opt_result::success ();
 517 }
 518
 519 /* Function vect_analyze_data_ref_dependences.
 520
 521    Examine all the data references in the loop, and make sure there do not
 522    exist any data dependences between them.  Set *MAX_VF according to
 523    the maximum vectorization factor the data dependences allow.  */
 524
 525 opt_result
 526 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 527                                    unsigned int *max_vf)
 528 {
 529   unsigned int i;
 530   struct data_dependence_relation *ddr;
 531
 532   DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
 533
 534   if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
 535     {
 536       LOOP_VINFO_DDRS (loop_vinfo)
 537         .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 538                  * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 539       /* We need read-read dependences to compute
 540          STMT_VINFO_SAME_ALIGN_REFS.  */
 541       bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 542                                           &LOOP_VINFO_DDRS (loop_vinfo),
 543                                           LOOP_VINFO_LOOP_NEST (loop_vinfo),
 544                                           true);
 545       gcc_assert (res);
 546     }
 547
 548   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 549
 550   /* For epilogues we either have no aliases or alias versioning
 551      was applied to original loop.  Therefore we may just get max_vf
 552      using VF of original loop.  */
 553   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 554     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 555   else
 556     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 557       {
 558         opt_result res
 559           = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
 560         if (!res)
 561           return res;
 562       }
 563
 564   return opt_result::success ();
 565 }
 566
 567
 568 /* Function vect_slp_analyze_data_ref_dependence.
 569
 570    Return TRUE if there (might) exist a dependence between a memory-reference
 571    DRA and a memory-reference DRB for VINFO.  When versioning for alias
 572    may check a dependence at run-time, return FALSE.  Adjust *MAX_VF
 573    according to the data dependence.  */
 574
 575 static bool
 576 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
 577                                       struct data_dependence_relation *ddr)
 578 {
 579   struct data_reference *dra = DDR_A (ddr);
 580   struct data_reference *drb = DDR_B (ddr);
 581   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
 582   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
 583
 584   /* We need to check dependences of statements marked as unvectorizable
 585      as well, they still can prohibit vectorization.  */
 586
 587   /* Independent data accesses.  */
 588   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 589     return false;
 590
 591   if (dra == drb)
 592     return false;
 593
 594   /* Read-read is OK.  */
 595   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 596     return false;
 597
 598   /* If dra and drb are part of the same interleaving chain consider
 599      them independent.  */
 600   if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
 601       && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
 602           == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
 603     return false;
 604
 605   /* Unknown data dependence.  */
 606   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 607     {
 608       if  (dump_enabled_p ())
 609         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 610                          "can't determine dependence between %T and %T\n",
 611                          DR_REF (dra), DR_REF (drb));
 612     }
 613   else if (dump_enabled_p ())
 614     dump_printf_loc (MSG_NOTE, vect_location,
 615                      "determined dependence between %T and %T\n",
 616                      DR_REF (dra), DR_REF (drb));
 617
 618   return true;
 619 }
 620
 621
 622 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 623    contain the vector of scalar stores of this instance if we are
 624    disambiguating the loads.  */
 625
 626 static bool
 627 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
 628                                    vec<stmt_vec_info> stores,
 629                                    stmt_vec_info last_store_info)
 630 {
 631   /* This walks over all stmts involved in the SLP load/store done
 632      in NODE verifying we can sink them up to the last stmt in the
 633      group.  */
 634   stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
 635   vec_info *vinfo = last_access_info->vinfo;
 636   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 637     {
 638       stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
 639       if (access_info == last_access_info)
 640         continue;
 641       data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
 642       ao_ref ref;
 643       bool ref_initialized_p = false;
 644       for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
 645            gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
 646         {
 647           gimple *stmt = gsi_stmt (gsi);
 648           if (! gimple_vuse (stmt)
 649               || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
 650             continue;
 651
 652           /* If we couldn't record a (single) data reference for this
 653              stmt we have to resort to the alias oracle.  */
 654           stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
 655           data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
 656           if (!dr_b)
 657             {
 658               /* We are moving a store or sinking a load - this means
 659                  we cannot use TBAA for disambiguation.  */
 660               if (!ref_initialized_p)
 661                 ao_ref_init (&ref, DR_REF (dr_a));
 662               if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
 663                   || ref_maybe_used_by_stmt_p (stmt, &ref, false))
 664                 return false;
 665               continue;
 666             }
 667
 668           bool dependent = false;
 669           /* If we run into a store of this same instance (we've just
 670              marked those) then delay dependence checking until we run
 671              into the last store because this is where it will have
 672              been sunk to (and we verify if we can do that as well).  */
 673           if (gimple_visited_p (stmt))
 674             {
 675               if (stmt_info != last_store_info)
 676                 continue;
 677               unsigned i;
 678               stmt_vec_info store_info;
 679               FOR_EACH_VEC_ELT (stores, i, store_info)
 680                 {
 681                   data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
 682                   ddr_p ddr = initialize_data_dependence_relation
 683                                 (dr_a, store_dr, vNULL);
 684                   dependent
 685                     = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 686                   free_dependence_relation (ddr);
 687                   if (dependent)
 688                     break;
 689                 }
 690             }
 691           else
 692             {
 693               ddr_p ddr = initialize_data_dependence_relation (dr_a,
 694                                                                dr_b, vNULL);
 695               dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 696               free_dependence_relation (ddr);
 697             }
 698           if (dependent)
 699             return false;
 700         }
 701     }
 702   return true;
 703 }
 704
 705
 706 /* Function vect_analyze_data_ref_dependences.
 707
 708    Examine all the data references in the basic-block, and make sure there
 709    do not exist any data dependences between them.  Set *MAX_VF according to
 710    the maximum vectorization factor the data dependences allow.  */
 711
 712 bool
 713 vect_slp_analyze_instance_dependence (slp_instance instance)
 714 {
 715   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
 716
 717   /* The stores of this instance are at the root of the SLP tree.  */
 718   slp_tree store = SLP_INSTANCE_TREE (instance);
 719   if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
 720     store = NULL;
 721
 722   /* Verify we can sink stores to the vectorized stmt insert location.  */
 723   stmt_vec_info last_store_info = NULL;
 724   if (store)
 725     {
 726       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
 727         return false;
 728
 729       /* Mark stores in this instance and remember the last one.  */
 730       last_store_info = vect_find_last_scalar_stmt_in_slp (store);
 731       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 732         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
 733     }
 734
 735   bool res = true;
 736
 737   /* Verify we can sink loads to the vectorized stmt insert location,
 738      special-casing stores of this instance.  */
 739   slp_tree load;
 740   unsigned int i;
 741   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
 742     if (! vect_slp_analyze_node_dependences (instance, load,
 743                                              store
 744                                              ? SLP_TREE_SCALAR_STMTS (store)
 745                                              : vNULL, last_store_info))
 746       {
 747         res = false;
 748         break;
 749       }
 750
 751   /* Unset the visited flag.  */
 752   if (store)
 753     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 754       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
 755
 756   return res;
 757 }
 758
 759 /* Record the base alignment guarantee given by DRB, which occurs
 760    in STMT_INFO.  */
 761
 762 static void
 763 vect_record_base_alignment (stmt_vec_info stmt_info,
 764                             innermost_loop_behavior *drb)
 765 {
 766   vec_info *vinfo = stmt_info->vinfo;
 767   bool existed;
 768   innermost_loop_behavior *&entry
 769     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 770   if (!existed || entry->base_alignment < drb->base_alignment)
 771     {
 772       entry = drb;
 773       if (dump_enabled_p ())
 774         dump_printf_loc (MSG_NOTE, vect_location,
 775                          "recording new base alignment for %T\n"
 776                          "  alignment:    %d\n"
 777                          "  misalignment: %d\n"
 778                          "  based on:     %G",
 779                          drb->base_address,
 780                          drb->base_alignment,
 781                          drb->base_misalignment,
 782                          stmt_info->stmt);
 783     }
 784 }
 785
 786 /* If the region we're going to vectorize is reached, all unconditional
 787    data references occur at least once.  We can therefore pool the base
 788    alignment guarantees from each unconditional reference.  Do this by
 789    going through all the data references in VINFO and checking whether
 790    the containing statement makes the reference unconditionally.  If so,
 791    record the alignment of the base address in VINFO so that it can be
 792    used for all other references with the same base.  */
 793
 794 void
 795 vect_record_base_alignments (vec_info *vinfo)
 796 {
 797   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 798   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 799   data_reference *dr;
 800   unsigned int i;
 801   FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
 802     {
 803       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
 804       stmt_vec_info stmt_info = dr_info->stmt;
 805       if (!DR_IS_CONDITIONAL_IN_STMT (dr)
 806           && STMT_VINFO_VECTORIZABLE (stmt_info)
 807           && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
 808         {
 809           vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
 810
 811           /* If DR is nested in the loop that is being vectorized, we can also
 812              record the alignment of the base wrt the outer loop.  */
 813           if (loop && nested_in_vect_loop_p (loop, stmt_info))
 814             vect_record_base_alignment
 815               (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 816         }
 817     }
 818 }
 819
 820 /* Return the target alignment for the vectorized form of DR_INFO.  */
 821
 822 static poly_uint64
 823 vect_calculate_target_alignment (dr_vec_info *dr_info)
 824 {
 825   tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
 826   return targetm.vectorize.preferred_vector_alignment (vectype);
 827 }
 828
 829 /* Function vect_compute_data_ref_alignment
 830
 831    Compute the misalignment of the data reference DR_INFO.
 832
 833    Output:
 834    1. DR_MISALIGNMENT (DR_INFO) is defined.
 835
 836    FOR NOW: No analysis is actually performed. Misalignment is calculated
 837    only for trivial cases. TODO.  */
 838
 839 static void
 840 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
 841 {
 842   stmt_vec_info stmt_info = dr_info->stmt;
 843   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
 844   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 845   struct loop *loop = NULL;
 846   tree ref = DR_REF (dr_info->dr);
 847   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 848
 849   if (dump_enabled_p ())
 850     dump_printf_loc (MSG_NOTE, vect_location,
 851                      "vect_compute_data_ref_alignment:\n");
 852
 853   if (loop_vinfo)
 854     loop = LOOP_VINFO_LOOP (loop_vinfo);
 855
 856   /* Initialize misalignment to unknown.  */
 857   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
 858
 859   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
 860     return;
 861
 862   innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
 863   bool step_preserves_misalignment_p;
 864
 865   poly_uint64 vector_alignment
 866     = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
 867   DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
 868
 869   unsigned HOST_WIDE_INT vect_align_c;
 870   if (!vector_alignment.is_constant (&vect_align_c))
 871     return;
 872
 873   /* No step for BB vectorization.  */
 874   if (!loop)
 875     {
 876       gcc_assert (integer_zerop (drb->step));
 877       step_preserves_misalignment_p = true;
 878     }
 879
 880   /* In case the dataref is in an inner-loop of the loop that is being
 881      vectorized (LOOP), we use the base and misalignment information
 882      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 883      stays the same throughout the execution of the inner-loop, which is why
 884      we have to check that the stride of the dataref in the inner-loop evenly
 885      divides by the vector alignment.  */
 886   else if (nested_in_vect_loop_p (loop, stmt_info))
 887     {
 888       step_preserves_misalignment_p
 889         = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
 890
 891       if (dump_enabled_p ())
 892         {
 893           if (step_preserves_misalignment_p)
 894             dump_printf_loc (MSG_NOTE, vect_location,
 895                              "inner step divides the vector alignment.\n");
 896           else
 897             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 898                              "inner step doesn't divide the vector"
 899                              " alignment.\n");
 900         }
 901     }
 902
 903   /* Similarly we can only use base and misalignment information relative to
 904      an innermost loop if the misalignment stays the same throughout the
 905      execution of the loop.  As above, this is the case if the stride of
 906      the dataref evenly divides by the alignment.  */
 907   else
 908     {
 909       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 910       step_preserves_misalignment_p
 911         = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
 912
 913       if (!step_preserves_misalignment_p && dump_enabled_p ())
 914         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 915                          "step doesn't divide the vector alignment.\n");
 916     }
 917
 918   unsigned int base_alignment = drb->base_alignment;
 919   unsigned int base_misalignment = drb->base_misalignment;
 920
 921   /* Calculate the maximum of the pooled base address alignment and the
 922      alignment that we can compute for DR itself.  */
 923   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
 924   if (entry && base_alignment < (*entry)->base_alignment)
 925     {
 926       base_alignment = (*entry)->base_alignment;
 927       base_misalignment = (*entry)->base_misalignment;
 928     }
 929
 930   if (drb->offset_alignment < vect_align_c
 931       || !step_preserves_misalignment_p
 932       /* We need to know whether the step wrt the vectorized loop is
 933          negative when computing the starting misalignment below.  */
 934       || TREE_CODE (drb->step) != INTEGER_CST)
 935     {
 936       if (dump_enabled_p ())
 937         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 938                          "Unknown alignment for access: %T\n", ref);
 939       return;
 940     }
 941
 942   if (base_alignment < vect_align_c)
 943     {
 944       unsigned int max_alignment;
 945       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
 946       if (max_alignment < vect_align_c
 947           || !vect_can_force_dr_alignment_p (base,
 948                                              vect_align_c * BITS_PER_UNIT))
 949         {
 950           if (dump_enabled_p ())
 951             dump_printf_loc (MSG_NOTE, vect_location,
 952                              "can't force alignment of ref: %T\n", ref);
 953           return;
 954         }
 955
 956       /* Force the alignment of the decl.
 957          NOTE: This is the only change to the code we make during
 958          the analysis phase, before deciding to vectorize the loop.  */
 959       if (dump_enabled_p ())
 960         dump_printf_loc (MSG_NOTE, vect_location,
 961                          "force alignment of %T\n", ref);
 962
 963       dr_info->base_decl = base;
 964       dr_info->base_misaligned = true;
 965       base_misalignment = 0;
 966     }
 967   poly_int64 misalignment
 968     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
 969
 970   /* If this is a backward running DR then first access in the larger
 971      vectype actually is N-1 elements before the address in the DR.
 972      Adjust misalign accordingly.  */
 973   if (tree_int_cst_sgn (drb->step) < 0)
 974     /* PLUS because STEP is negative.  */
 975     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
 976                      * TREE_INT_CST_LOW (drb->step));
 977
 978   unsigned int const_misalignment;
 979   if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
 980     {
 981       if (dump_enabled_p ())
 982         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 983                          "Non-constant misalignment for access: %T\n", ref);
 984       return;
 985     }
 986
 987   SET_DR_MISALIGNMENT (dr_info, const_misalignment);
 988
 989   if (dump_enabled_p ())
 990     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 991                      "misalign = %d bytes of ref %T\n",
 992                      DR_MISALIGNMENT (dr_info), ref);
 993
 994   return;
 995 }
 996
 997 /* Function vect_update_misalignment_for_peel.
 998    Sets DR_INFO's misalignment
 999    - to 0 if it has the same alignment as DR_PEEL_INFO,
1000    - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1001    - to -1 (unknown) otherwise.
1002
1003    DR_INFO - the data reference whose misalignment is to be adjusted.
1004    DR_PEEL_INFO - the data reference whose misalignment is being made
1005                   zero in the vector loop by the peel.
1006    NPEEL - the number of iterations in the peel loop if the misalignment
1007            of DR_PEEL_INFO is known at compile time.  */
1008
1009 static void
1010 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1011                                    dr_vec_info *dr_peel_info, int npeel)
1012 {
1013   unsigned int i;
1014   vec<dr_p> same_aligned_drs;
1015   struct data_reference *current_dr;
1016   stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1017
1018   /* It can be assumed that if dr_info has the same alignment as dr_peel,
1019      it is aligned in the vector loop.  */
1020   same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1021   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1022     {
1023       if (current_dr != dr_info->dr)
1024         continue;
1025       gcc_assert (!known_alignment_for_access_p (dr_info)
1026                   || !known_alignment_for_access_p (dr_peel_info)
1027                   || (DR_MISALIGNMENT (dr_info)
1028                       == DR_MISALIGNMENT (dr_peel_info)));
1029       SET_DR_MISALIGNMENT (dr_info, 0);
1030       return;
1031     }
1032
1033   unsigned HOST_WIDE_INT alignment;
1034   if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1035       && known_alignment_for_access_p (dr_info)
1036       && known_alignment_for_access_p (dr_peel_info))
1037     {
1038       int misal = DR_MISALIGNMENT (dr_info);
1039       misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1040       misal &= alignment - 1;
1041       SET_DR_MISALIGNMENT (dr_info, misal);
1042       return;
1043     }
1044
1045   if (dump_enabled_p ())
1046     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1047                      "to unknown (-1).\n");
1048   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1049 }
1050
1051
1052 /* Function verify_data_ref_alignment
1053
1054    Return TRUE if DR_INFO can be handled with respect to alignment.  */
1055
1056 static opt_result
1057 verify_data_ref_alignment (dr_vec_info *dr_info)
1058 {
1059   enum dr_alignment_support supportable_dr_alignment
1060     = vect_supportable_dr_alignment (dr_info, false);
1061   if (!supportable_dr_alignment)
1062     return opt_result::failure_at
1063       (dr_info->stmt->stmt,
1064        DR_IS_READ (dr_info->dr)
1065         ? "not vectorized: unsupported unaligned load: %T\n"
1066         : "not vectorized: unsupported unaligned store: %T\n",
1067        DR_REF (dr_info->dr));
1068
1069   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1070     dump_printf_loc (MSG_NOTE, vect_location,
1071                      "Vectorizing an unaligned access.\n");
1072
1073   return opt_result::success ();
1074 }
1075
1076 /* Function vect_verify_datarefs_alignment
1077
1078    Return TRUE if all data references in the loop can be
1079    handled with respect to alignment.  */
1080
1081 opt_result
1082 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1083 {
1084   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1085   struct data_reference *dr;
1086   unsigned int i;
1087
1088   FOR_EACH_VEC_ELT (datarefs, i, dr)
1089     {
1090       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1091       stmt_vec_info stmt_info = dr_info->stmt;
1092
1093       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1094         continue;
1095
1096       /* For interleaving, only the alignment of the first access matters.   */
1097       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1098           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1099         continue;
1100
1101       /* Strided accesses perform only component accesses, alignment is
1102          irrelevant for them.  */
1103       if (STMT_VINFO_STRIDED_P (stmt_info)
1104           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1105         continue;
1106
1107       opt_result res = verify_data_ref_alignment (dr_info);
1108       if (!res)
1109         return res;
1110     }
1111
1112   return opt_result::success ();
1113 }
1114
1115 /* Given an memory reference EXP return whether its alignment is less
1116    than its size.  */
1117
1118 static bool
1119 not_size_aligned (tree exp)
1120 {
1121   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1122     return true;
1123
1124   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1125           > get_object_alignment (exp));
1126 }
1127
1128 /* Function vector_alignment_reachable_p
1129
1130    Return true if vector alignment for DR_INFO is reachable by peeling
1131    a few loop iterations.  Return false otherwise.  */
1132
1133 static bool
1134 vector_alignment_reachable_p (dr_vec_info *dr_info)
1135 {
1136   stmt_vec_info stmt_info = dr_info->stmt;
1137   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1138
1139   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1140     {
1141       /* For interleaved access we peel only if number of iterations in
1142          the prolog loop ({VF - misalignment}), is a multiple of the
1143          number of the interleaved accesses.  */
1144       int elem_size, mis_in_elements;
1145
1146       /* FORNOW: handle only known alignment.  */
1147       if (!known_alignment_for_access_p (dr_info))
1148         return false;
1149
1150       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1151       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1152       elem_size = vector_element_size (vector_size, nelements);
1153       mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1154
1155       if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1156         return false;
1157     }
1158
1159   /* If misalignment is known at the compile time then allow peeling
1160      only if natural alignment is reachable through peeling.  */
1161   if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1162     {
1163       HOST_WIDE_INT elmsize =
1164                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1165       if (dump_enabled_p ())
1166         {
1167           dump_printf_loc (MSG_NOTE, vect_location,
1168                            "data size = %wd. misalignment = %d.\n", elmsize,
1169                            DR_MISALIGNMENT (dr_info));
1170         }
1171       if (DR_MISALIGNMENT (dr_info) % elmsize)
1172         {
1173           if (dump_enabled_p ())
1174             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1175                              "data size does not divide the misalignment.\n");
1176           return false;
1177         }
1178     }
1179
1180   if (!known_alignment_for_access_p (dr_info))
1181     {
1182       tree type = TREE_TYPE (DR_REF (dr_info->dr));
1183       bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1184       if (dump_enabled_p ())
1185         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1186                          "Unknown misalignment, %snaturally aligned\n",
1187                          is_packed ? "not " : "");
1188       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1189     }
1190
1191   return true;
1192 }
1193
1194
1195 /* Calculate the cost of the memory access represented by DR_INFO.  */
1196
1197 static void
1198 vect_get_data_access_cost (dr_vec_info *dr_info,
1199                            unsigned int *inside_cost,
1200                            unsigned int *outside_cost,
1201                            stmt_vector_for_cost *body_cost_vec,
1202                            stmt_vector_for_cost *prologue_cost_vec)
1203 {
1204   stmt_vec_info stmt_info = dr_info->stmt;
1205   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1206   int ncopies;
1207
1208   if (PURE_SLP_STMT (stmt_info))
1209     ncopies = 1;
1210   else
1211     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1212
1213   if (DR_IS_READ (dr_info->dr))
1214     vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1215                         prologue_cost_vec, body_cost_vec, false);
1216   else
1217     vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1218
1219   if (dump_enabled_p ())
1220     dump_printf_loc (MSG_NOTE, vect_location,
1221                      "vect_get_data_access_cost: inside_cost = %d, "
1222                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1223 }
1224
1225
1226 typedef struct _vect_peel_info
1227 {
1228   dr_vec_info *dr_info;
1229   int npeel;
1230   unsigned int count;
1231 } *vect_peel_info;
1232
1233 typedef struct _vect_peel_extended_info
1234 {
1235   struct _vect_peel_info peel_info;
1236   unsigned int inside_cost;
1237   unsigned int outside_cost;
1238 } *vect_peel_extended_info;
1239
1240
1241 /* Peeling hashtable helpers.  */
1242
1243 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1244 {
1245   static inline hashval_t hash (const _vect_peel_info *);
1246   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1247 };
1248
1249 inline hashval_t
1250 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1251 {
1252   return (hashval_t) peel_info->npeel;
1253 }
1254
1255 inline bool
1256 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1257 {
1258   return (a->npeel == b->npeel);
1259 }
1260
1261
1262 /* Insert DR_INFO into peeling hash table with NPEEL as key.  */
1263
1264 static void
1265 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1266                           loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1267                           int npeel)
1268 {
1269   struct _vect_peel_info elem, *slot;
1270   _vect_peel_info **new_slot;
1271   bool supportable_dr_alignment
1272     = vect_supportable_dr_alignment (dr_info, true);
1273
1274   elem.npeel = npeel;
1275   slot = peeling_htab->find (&elem);
1276   if (slot)
1277     slot->count++;
1278   else
1279     {
1280       slot = XNEW (struct _vect_peel_info);
1281       slot->npeel = npeel;
1282       slot->dr_info = dr_info;
1283       slot->count = 1;
1284       new_slot = peeling_htab->find_slot (slot, INSERT);
1285       *new_slot = slot;
1286     }
1287
1288   if (!supportable_dr_alignment
1289       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1290     slot->count += VECT_MAX_COST;
1291 }
1292
1293
1294 /* Traverse peeling hash table to find peeling option that aligns maximum
1295    number of data accesses.  */
1296
1297 int
1298 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1299                                      _vect_peel_extended_info *max)
1300 {
1301   vect_peel_info elem = *slot;
1302
1303   if (elem->count > max->peel_info.count
1304       || (elem->count == max->peel_info.count
1305           && max->peel_info.npeel > elem->npeel))
1306     {
1307       max->peel_info.npeel = elem->npeel;
1308       max->peel_info.count = elem->count;
1309       max->peel_info.dr_info = elem->dr_info;
1310     }
1311
1312   return 1;
1313 }
1314
1315 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1316    data access costs for all data refs.  If UNKNOWN_MISALIGNMENT is true,
1317    we assume DR0_INFO's misalignment will be zero after peeling.  */
1318
1319 static void
1320 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1321                                 dr_vec_info *dr0_info,
1322                                 unsigned int *inside_cost,
1323                                 unsigned int *outside_cost,
1324                                 stmt_vector_for_cost *body_cost_vec,
1325                                 stmt_vector_for_cost *prologue_cost_vec,
1326                                 unsigned int npeel,
1327                                 bool unknown_misalignment)
1328 {
1329   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1330   unsigned i;
1331   data_reference *dr;
1332
1333   FOR_EACH_VEC_ELT (datarefs, i, dr)
1334     {
1335       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1336       stmt_vec_info stmt_info = dr_info->stmt;
1337       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1338         continue;
1339
1340       /* For interleaving, only the alignment of the first access
1341          matters.  */
1342       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1343           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1344         continue;
1345
1346       /* Strided accesses perform only component accesses, alignment is
1347          irrelevant for them.  */
1348       if (STMT_VINFO_STRIDED_P (stmt_info)
1349           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1350         continue;
1351
1352       int save_misalignment;
1353       save_misalignment = DR_MISALIGNMENT (dr_info);
1354       if (npeel == 0)
1355         ;
1356       else if (unknown_misalignment && dr_info == dr0_info)
1357         SET_DR_MISALIGNMENT (dr_info, 0);
1358       else
1359         vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1360       vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1361                                  body_cost_vec, prologue_cost_vec);
1362       SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1363     }
1364 }
1365
1366 /* Traverse peeling hash table and calculate cost for each peeling option.
1367    Find the one with the lowest cost.  */
1368
1369 int
1370 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1371                                    _vect_peel_extended_info *min)
1372 {
1373   vect_peel_info elem = *slot;
1374   int dummy;
1375   unsigned int inside_cost = 0, outside_cost = 0;
1376   stmt_vec_info stmt_info = elem->dr_info->stmt;
1377   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1378   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1379                        epilogue_cost_vec;
1380
1381   prologue_cost_vec.create (2);
1382   body_cost_vec.create (2);
1383   epilogue_cost_vec.create (2);
1384
1385   vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1386                                   &outside_cost, &body_cost_vec,
1387                                   &prologue_cost_vec, elem->npeel, false);
1388
1389   body_cost_vec.release ();
1390
1391   outside_cost += vect_get_known_peeling_cost
1392     (loop_vinfo, elem->npeel, &dummy,
1393      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1394      &prologue_cost_vec, &epilogue_cost_vec);
1395
1396   /* Prologue and epilogue costs are added to the target model later.
1397      These costs depend only on the scalar iteration cost, the
1398      number of peeling iterations finally chosen, and the number of
1399      misaligned statements.  So discard the information found here.  */
1400   prologue_cost_vec.release ();
1401   epilogue_cost_vec.release ();
1402
1403   if (inside_cost < min->inside_cost
1404       || (inside_cost == min->inside_cost
1405           && outside_cost < min->outside_cost))
1406     {
1407       min->inside_cost = inside_cost;
1408       min->outside_cost = outside_cost;
1409       min->peel_info.dr_info = elem->dr_info;
1410       min->peel_info.npeel = elem->npeel;
1411       min->peel_info.count = elem->count;
1412     }
1413
1414   return 1;
1415 }
1416
1417
1418 /* Choose best peeling option by traversing peeling hash table and either
1419    choosing an option with the lowest cost (if cost model is enabled) or the
1420    option that aligns as many accesses as possible.  */
1421
1422 static struct _vect_peel_extended_info
1423 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1424                                        loop_vec_info loop_vinfo)
1425 {
1426    struct _vect_peel_extended_info res;
1427
1428    res.peel_info.dr_info = NULL;
1429
1430    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1431      {
1432        res.inside_cost = INT_MAX;
1433        res.outside_cost = INT_MAX;
1434        peeling_htab->traverse <_vect_peel_extended_info *,
1435                                vect_peeling_hash_get_lowest_cost> (&res);
1436      }
1437    else
1438      {
1439        res.peel_info.count = 0;
1440        peeling_htab->traverse <_vect_peel_extended_info *,
1441                                vect_peeling_hash_get_most_frequent> (&res);
1442        res.inside_cost = 0;
1443        res.outside_cost = 0;
1444      }
1445
1446    return res;
1447 }
1448
1449 /* Return true if the new peeling NPEEL is supported.  */
1450
1451 static bool
1452 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1453                           unsigned npeel)
1454 {
1455   unsigned i;
1456   struct data_reference *dr = NULL;
1457   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1458   enum dr_alignment_support supportable_dr_alignment;
1459
1460   /* Ensure that all data refs can be vectorized after the peel.  */
1461   FOR_EACH_VEC_ELT (datarefs, i, dr)
1462     {
1463       int save_misalignment;
1464
1465       if (dr == dr0_info->dr)
1466         continue;
1467
1468       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1469       stmt_vec_info stmt_info = dr_info->stmt;
1470       /* For interleaving, only the alignment of the first access
1471          matters.  */
1472       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1473           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1474         continue;
1475
1476       /* Strided accesses perform only component accesses, alignment is
1477          irrelevant for them.  */
1478       if (STMT_VINFO_STRIDED_P (stmt_info)
1479           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1480         continue;
1481
1482       save_misalignment = DR_MISALIGNMENT (dr_info);
1483       vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1484       supportable_dr_alignment
1485         = vect_supportable_dr_alignment (dr_info, false);
1486       SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1487
1488       if (!supportable_dr_alignment)
1489         return false;
1490     }
1491
1492   return true;
1493 }
1494
1495 /* Function vect_enhance_data_refs_alignment
1496
1497    This pass will use loop versioning and loop peeling in order to enhance
1498    the alignment of data references in the loop.
1499
1500    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1501    original loop is to be vectorized.  Any other loops that are created by
1502    the transformations performed in this pass - are not supposed to be
1503    vectorized.  This restriction will be relaxed.
1504
1505    This pass will require a cost model to guide it whether to apply peeling
1506    or versioning or a combination of the two.  For example, the scheme that
1507    intel uses when given a loop with several memory accesses, is as follows:
1508    choose one memory access ('p') which alignment you want to force by doing
1509    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1510    other accesses are not necessarily aligned, or (2) use loop versioning to
1511    generate one loop in which all accesses are aligned, and another loop in
1512    which only 'p' is necessarily aligned.
1513
1514    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1515    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1516    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1517
1518    Devising a cost model is the most critical aspect of this work.  It will
1519    guide us on which access to peel for, whether to use loop versioning, how
1520    many versions to create, etc.  The cost model will probably consist of
1521    generic considerations as well as target specific considerations (on
1522    powerpc for example, misaligned stores are more painful than misaligned
1523    loads).
1524
1525    Here are the general steps involved in alignment enhancements:
1526
1527      -- original loop, before alignment analysis:
1528         for (i=0; i<N; i++){
1529           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1530           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1531         }
1532
1533      -- After vect_compute_data_refs_alignment:
1534         for (i=0; i<N; i++){
1535           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1536           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1537         }
1538
1539      -- Possibility 1: we do loop versioning:
1540      if (p is aligned) {
1541         for (i=0; i<N; i++){    # loop 1A
1542           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1543           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1544         }
1545      }
1546      else {
1547         for (i=0; i<N; i++){    # loop 1B
1548           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1549           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1550         }
1551      }
1552
1553      -- Possibility 2: we do loop peeling:
1554      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1555         x = q[i];
1556         p[i] = y;
1557      }
1558      for (i = 3; i < N; i++){   # loop 2A
1559         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1560         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1561      }
1562
1563      -- Possibility 3: combination of loop peeling and versioning:
1564      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1565         x = q[i];
1566         p[i] = y;
1567      }
1568      if (p is aligned) {
1569         for (i = 3; i<N; i++){  # loop 3A
1570           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1571           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1572         }
1573      }
1574      else {
1575         for (i = 3; i<N; i++){  # loop 3B
1576           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1577           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1578         }
1579      }
1580
1581      These loops are later passed to loop_transform to be vectorized.  The
1582      vectorizer will use the alignment information to guide the transformation
1583      (whether to generate regular loads/stores, or with special handling for
1584      misalignment).  */
1585
1586 opt_result
1587 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1588 {
1589   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1590   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1591   enum dr_alignment_support supportable_dr_alignment;
1592   dr_vec_info *first_store = NULL;
1593   dr_vec_info *dr0_info = NULL;
1594   struct data_reference *dr;
1595   unsigned int i, j;
1596   bool do_peeling = false;
1597   bool do_versioning = false;
1598   unsigned int npeel = 0;
1599   bool one_misalignment_known = false;
1600   bool one_misalignment_unknown = false;
1601   bool one_dr_unsupportable = false;
1602   dr_vec_info *unsupportable_dr_info = NULL;
1603   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1604   unsigned possible_npeel_number = 1;
1605   tree vectype;
1606   unsigned int mis, same_align_drs_max = 0;
1607   hash_table<peel_info_hasher> peeling_htab (1);
1608
1609   DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1610
1611   /* Reset data so we can safely be called multiple times.  */
1612   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1613   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1614
1615   /* While cost model enhancements are expected in the future, the high level
1616      view of the code at this time is as follows:
1617
1618      A) If there is a misaligned access then see if peeling to align
1619         this access can make all data references satisfy
1620         vect_supportable_dr_alignment.  If so, update data structures
1621         as needed and return true.
1622
1623      B) If peeling wasn't possible and there is a data reference with an
1624         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1625         then see if loop versioning checks can be used to make all data
1626         references satisfy vect_supportable_dr_alignment.  If so, update
1627         data structures as needed and return true.
1628
1629      C) If neither peeling nor versioning were successful then return false if
1630         any data reference does not satisfy vect_supportable_dr_alignment.
1631
1632      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1633
1634      Note, Possibility 3 above (which is peeling and versioning together) is not
1635      being done at this time.  */
1636
1637   /* (1) Peeling to force alignment.  */
1638
1639   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1640      Considerations:
1641      + How many accesses will become aligned due to the peeling
1642      - How many accesses will become unaligned due to the peeling,
1643        and the cost of misaligned accesses.
1644      - The cost of peeling (the extra runtime checks, the increase
1645        in code size).  */
1646
1647   FOR_EACH_VEC_ELT (datarefs, i, dr)
1648     {
1649       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1650       stmt_vec_info stmt_info = dr_info->stmt;
1651
1652       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1653         continue;
1654
1655       /* For interleaving, only the alignment of the first access
1656          matters.  */
1657       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1658           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1659         continue;
1660
1661       /* For scatter-gather or invariant accesses there is nothing
1662          to enhance.  */
1663       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1664           || integer_zerop (DR_STEP (dr)))
1665         continue;
1666
1667       /* Strided accesses perform only component accesses, alignment is
1668          irrelevant for them.  */
1669       if (STMT_VINFO_STRIDED_P (stmt_info)
1670           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1671         continue;
1672
1673       supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1674       do_peeling = vector_alignment_reachable_p (dr_info);
1675       if (do_peeling)
1676         {
1677           if (known_alignment_for_access_p (dr_info))
1678             {
1679               unsigned int npeel_tmp = 0;
1680               bool negative = tree_int_cst_compare (DR_STEP (dr),
1681                                                     size_zero_node) < 0;
1682
1683               vectype = STMT_VINFO_VECTYPE (stmt_info);
1684               /* If known_alignment_for_access_p then we have set
1685                  DR_MISALIGNMENT which is only done if we know it at compiler
1686                  time, so it is safe to assume target alignment is constant.
1687                */
1688               unsigned int target_align =
1689                 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1690               unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1691               mis = (negative
1692                      ? DR_MISALIGNMENT (dr_info)
1693                      : -DR_MISALIGNMENT (dr_info));
1694               if (DR_MISALIGNMENT (dr_info) != 0)
1695                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1696
1697               /* For multiple types, it is possible that the bigger type access
1698                  will have more than one peeling option.  E.g., a loop with two
1699                  types: one of size (vector size / 4), and the other one of
1700                  size (vector size / 8).  Vectorization factor will 8.  If both
1701                  accesses are misaligned by 3, the first one needs one scalar
1702                  iteration to be aligned, and the second one needs 5.  But the
1703                  first one will be aligned also by peeling 5 scalar
1704                  iterations, and in that case both accesses will be aligned.
1705                  Hence, except for the immediate peeling amount, we also want
1706                  to try to add full vector size, while we don't exceed
1707                  vectorization factor.
1708                  We do this automatically for cost model, since we calculate
1709                  cost for every peeling option.  */
1710               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1711                 {
1712                   poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1713                                           ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1714                   possible_npeel_number
1715                     = vect_get_num_vectors (nscalars, vectype);
1716
1717                   /* NPEEL_TMP is 0 when there is no misalignment, but also
1718                      allow peeling NELEMENTS.  */
1719                   if (DR_MISALIGNMENT (dr_info) == 0)
1720                     possible_npeel_number++;
1721                 }
1722
1723               /* Save info about DR in the hash table.  Also include peeling
1724                  amounts according to the explanation above.  */
1725               for (j = 0; j < possible_npeel_number; j++)
1726                 {
1727                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1728                                             dr_info, npeel_tmp);
1729                   npeel_tmp += target_align / dr_size;
1730                 }
1731
1732               one_misalignment_known = true;
1733             }
1734           else
1735             {
1736               /* If we don't know any misalignment values, we prefer
1737                  peeling for data-ref that has the maximum number of data-refs
1738                  with the same alignment, unless the target prefers to align
1739                  stores over load.  */
1740               unsigned same_align_drs
1741                 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1742               if (!dr0_info
1743                   || same_align_drs_max < same_align_drs)
1744                 {
1745                   same_align_drs_max = same_align_drs;
1746                   dr0_info = dr_info;
1747                 }
1748               /* For data-refs with the same number of related
1749                  accesses prefer the one where the misalign
1750                  computation will be invariant in the outermost loop.  */
1751               else if (same_align_drs_max == same_align_drs)
1752                 {
1753                   struct loop *ivloop0, *ivloop;
1754                   ivloop0 = outermost_invariant_loop_for_expr
1755                     (loop, DR_BASE_ADDRESS (dr0_info->dr));
1756                   ivloop = outermost_invariant_loop_for_expr
1757                     (loop, DR_BASE_ADDRESS (dr));
1758                   if ((ivloop && !ivloop0)
1759                       || (ivloop && ivloop0
1760                           && flow_loop_nested_p (ivloop, ivloop0)))
1761                     dr0_info = dr_info;
1762                 }
1763
1764               one_misalignment_unknown = true;
1765
1766               /* Check for data refs with unsupportable alignment that
1767                  can be peeled.  */
1768               if (!supportable_dr_alignment)
1769               {
1770                 one_dr_unsupportable = true;
1771                 unsupportable_dr_info = dr_info;
1772               }
1773
1774               if (!first_store && DR_IS_WRITE (dr))
1775                 first_store = dr_info;
1776             }
1777         }
1778       else
1779         {
1780           if (!aligned_access_p (dr_info))
1781             {
1782               if (dump_enabled_p ())
1783                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1784                                  "vector alignment may not be reachable\n");
1785               break;
1786             }
1787         }
1788     }
1789
1790   /* Check if we can possibly peel the loop.  */
1791   if (!vect_can_advance_ivs_p (loop_vinfo)
1792       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1793       || loop->inner)
1794     do_peeling = false;
1795
1796   struct _vect_peel_extended_info peel_for_known_alignment;
1797   struct _vect_peel_extended_info peel_for_unknown_alignment;
1798   struct _vect_peel_extended_info best_peel;
1799
1800   peel_for_unknown_alignment.inside_cost = INT_MAX;
1801   peel_for_unknown_alignment.outside_cost = INT_MAX;
1802   peel_for_unknown_alignment.peel_info.count = 0;
1803
1804   if (do_peeling
1805       && one_misalignment_unknown)
1806     {
1807       /* Check if the target requires to prefer stores over loads, i.e., if
1808          misaligned stores are more expensive than misaligned loads (taking
1809          drs with same alignment into account).  */
1810       unsigned int load_inside_cost = 0;
1811       unsigned int load_outside_cost = 0;
1812       unsigned int store_inside_cost = 0;
1813       unsigned int store_outside_cost = 0;
1814       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1815
1816       stmt_vector_for_cost dummy;
1817       dummy.create (2);
1818       vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1819                                       &load_inside_cost,
1820                                       &load_outside_cost,
1821                                       &dummy, &dummy, estimated_npeels, true);
1822       dummy.release ();
1823
1824       if (first_store)
1825         {
1826           dummy.create (2);
1827           vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1828                                           &store_inside_cost,
1829                                           &store_outside_cost,
1830                                           &dummy, &dummy,
1831                                           estimated_npeels, true);
1832           dummy.release ();
1833         }
1834       else
1835         {
1836           store_inside_cost = INT_MAX;
1837           store_outside_cost = INT_MAX;
1838         }
1839
1840       if (load_inside_cost > store_inside_cost
1841           || (load_inside_cost == store_inside_cost
1842               && load_outside_cost > store_outside_cost))
1843         {
1844           dr0_info = first_store;
1845           peel_for_unknown_alignment.inside_cost = store_inside_cost;
1846           peel_for_unknown_alignment.outside_cost = store_outside_cost;
1847         }
1848       else
1849         {
1850           peel_for_unknown_alignment.inside_cost = load_inside_cost;
1851           peel_for_unknown_alignment.outside_cost = load_outside_cost;
1852         }
1853
1854       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1855       prologue_cost_vec.create (2);
1856       epilogue_cost_vec.create (2);
1857
1858       int dummy2;
1859       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1860         (loop_vinfo, estimated_npeels, &dummy2,
1861          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1862          &prologue_cost_vec, &epilogue_cost_vec);
1863
1864       prologue_cost_vec.release ();
1865       epilogue_cost_vec.release ();
1866
1867       peel_for_unknown_alignment.peel_info.count = 1
1868         + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1869     }
1870
1871   peel_for_unknown_alignment.peel_info.npeel = 0;
1872   peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1873
1874   best_peel = peel_for_unknown_alignment;
1875
1876   peel_for_known_alignment.inside_cost = INT_MAX;
1877   peel_for_known_alignment.outside_cost = INT_MAX;
1878   peel_for_known_alignment.peel_info.count = 0;
1879   peel_for_known_alignment.peel_info.dr_info = NULL;
1880
1881   if (do_peeling && one_misalignment_known)
1882     {
1883       /* Peeling is possible, but there is no data access that is not supported
1884          unless aligned.  So we try to choose the best possible peeling from
1885          the hash table.  */
1886       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1887         (&peeling_htab, loop_vinfo);
1888     }
1889
1890   /* Compare costs of peeling for known and unknown alignment. */
1891   if (peel_for_known_alignment.peel_info.dr_info != NULL
1892       && peel_for_unknown_alignment.inside_cost
1893       >= peel_for_known_alignment.inside_cost)
1894     {
1895       best_peel = peel_for_known_alignment;
1896
1897       /* If the best peeling for known alignment has NPEEL == 0, perform no
1898          peeling at all except if there is an unsupportable dr that we can
1899          align.  */
1900       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1901         do_peeling = false;
1902     }
1903
1904   /* If there is an unsupportable data ref, prefer this over all choices so far
1905      since we'd have to discard a chosen peeling except when it accidentally
1906      aligned the unsupportable data ref.  */
1907   if (one_dr_unsupportable)
1908     dr0_info = unsupportable_dr_info;
1909   else if (do_peeling)
1910     {
1911       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1912          TODO: Use nopeel_outside_cost or get rid of it?  */
1913       unsigned nopeel_inside_cost = 0;
1914       unsigned nopeel_outside_cost = 0;
1915
1916       stmt_vector_for_cost dummy;
1917       dummy.create (2);
1918       vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
1919                                       &nopeel_outside_cost, &dummy, &dummy,
1920                                       0, false);
1921       dummy.release ();
1922
1923       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
1924          costs will be recorded.  */
1925       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1926       prologue_cost_vec.create (2);
1927       epilogue_cost_vec.create (2);
1928
1929       int dummy2;
1930       nopeel_outside_cost += vect_get_known_peeling_cost
1931         (loop_vinfo, 0, &dummy2,
1932          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1933          &prologue_cost_vec, &epilogue_cost_vec);
1934
1935       prologue_cost_vec.release ();
1936       epilogue_cost_vec.release ();
1937
1938       npeel = best_peel.peel_info.npeel;
1939       dr0_info = best_peel.peel_info.dr_info;
1940
1941       /* If no peeling is not more expensive than the best peeling we
1942          have so far, don't perform any peeling.  */
1943       if (nopeel_inside_cost <= best_peel.inside_cost)
1944         do_peeling = false;
1945     }
1946
1947   if (do_peeling)
1948     {
1949       stmt_vec_info stmt_info = dr0_info->stmt;
1950       vectype = STMT_VINFO_VECTYPE (stmt_info);
1951
1952       if (known_alignment_for_access_p (dr0_info))
1953         {
1954           bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
1955                                                 size_zero_node) < 0;
1956           if (!npeel)
1957             {
1958               /* Since it's known at compile time, compute the number of
1959                  iterations in the peeled loop (the peeling factor) for use in
1960                  updating DR_MISALIGNMENT values.  The peeling factor is the
1961                  vectorization factor minus the misalignment as an element
1962                  count.  */
1963               mis = (negative
1964                      ? DR_MISALIGNMENT (dr0_info)
1965                      : -DR_MISALIGNMENT (dr0_info));
1966               /* If known_alignment_for_access_p then we have set
1967                  DR_MISALIGNMENT which is only done if we know it at compiler
1968                  time, so it is safe to assume target alignment is constant.
1969                */
1970               unsigned int target_align =
1971                 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
1972               npeel = ((mis & (target_align - 1))
1973                        / vect_get_scalar_dr_size (dr0_info));
1974             }
1975
1976           /* For interleaved data access every iteration accesses all the
1977              members of the group, therefore we divide the number of iterations
1978              by the group size.  */
1979           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1980             npeel /= DR_GROUP_SIZE (stmt_info);
1981
1982           if (dump_enabled_p ())
1983             dump_printf_loc (MSG_NOTE, vect_location,
1984                              "Try peeling by %d\n", npeel);
1985         }
1986
1987       /* Ensure that all datarefs can be vectorized after the peel.  */
1988       if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
1989         do_peeling = false;
1990
1991       /* Check if all datarefs are supportable and log.  */
1992       if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
1993         {
1994           opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
1995           if (!stat)
1996             do_peeling = false;
1997           else
1998             return stat;
1999         }
2000
2001       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2002       if (do_peeling)
2003         {
2004           unsigned max_allowed_peel
2005             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2006           if (max_allowed_peel != (unsigned)-1)
2007             {
2008               unsigned max_peel = npeel;
2009               if (max_peel == 0)
2010                 {
2011                   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2012                   unsigned HOST_WIDE_INT target_align_c;
2013                   if (target_align.is_constant (&target_align_c))
2014                     max_peel =
2015                       target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2016                   else
2017                     {
2018                       do_peeling = false;
2019                       if (dump_enabled_p ())
2020                         dump_printf_loc (MSG_NOTE, vect_location,
2021                           "Disable peeling, max peels set and vector"
2022                           " alignment unknown\n");
2023                     }
2024                 }
2025               if (max_peel > max_allowed_peel)
2026                 {
2027                   do_peeling = false;
2028                   if (dump_enabled_p ())
2029                     dump_printf_loc (MSG_NOTE, vect_location,
2030                         "Disable peeling, max peels reached: %d\n", max_peel);
2031                 }
2032             }
2033         }
2034
2035       /* Cost model #2 - if peeling may result in a remaining loop not
2036          iterating enough to be vectorized then do not peel.  Since this
2037          is a cost heuristic rather than a correctness decision, use the
2038          most likely runtime value for variable vectorization factors.  */
2039       if (do_peeling
2040           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2041         {
2042           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2043           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2044           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2045               < assumed_vf + max_peel)
2046             do_peeling = false;
2047         }
2048
2049       if (do_peeling)
2050         {
2051           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2052              If the misalignment of DR_i is identical to that of dr0 then set
2053              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2054              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2055              by the peeling factor times the element size of DR_i (MOD the
2056              vectorization factor times the size).  Otherwise, the
2057              misalignment of DR_i must be set to unknown.  */
2058           FOR_EACH_VEC_ELT (datarefs, i, dr)
2059             if (dr != dr0_info->dr)
2060               {
2061                 /* Strided accesses perform only component accesses, alignment
2062                    is irrelevant for them.  */
2063                 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2064                 stmt_info = dr_info->stmt;
2065                 if (STMT_VINFO_STRIDED_P (stmt_info)
2066                     && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2067                   continue;
2068
2069                 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2070               }
2071
2072           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2073           if (npeel)
2074             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2075           else
2076             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2077               = DR_MISALIGNMENT (dr0_info);
2078           SET_DR_MISALIGNMENT (dr0_info, 0);
2079           if (dump_enabled_p ())
2080             {
2081               dump_printf_loc (MSG_NOTE, vect_location,
2082                                "Alignment of access forced using peeling.\n");
2083               dump_printf_loc (MSG_NOTE, vect_location,
2084                                "Peeling for alignment will be applied.\n");
2085             }
2086
2087           /* The inside-loop cost will be accounted for in vectorizable_load
2088              and vectorizable_store correctly with adjusted alignments.
2089              Drop the body_cst_vec on the floor here.  */
2090           opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2091           gcc_assert (stat);
2092           return stat;
2093         }
2094     }
2095
2096   /* (2) Versioning to force alignment.  */
2097
2098   /* Try versioning if:
2099      1) optimize loop for speed
2100      2) there is at least one unsupported misaligned data ref with an unknown
2101         misalignment, and
2102      3) all misaligned data refs with a known misalignment are supported, and
2103      4) the number of runtime alignment checks is within reason.  */
2104
2105   do_versioning =
2106         optimize_loop_nest_for_speed_p (loop)
2107         && (!loop->inner); /* FORNOW */
2108
2109   if (do_versioning)
2110     {
2111       FOR_EACH_VEC_ELT (datarefs, i, dr)
2112         {
2113           dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2114           stmt_vec_info stmt_info = dr_info->stmt;
2115
2116           /* For interleaving, only the alignment of the first access
2117              matters.  */
2118           if (aligned_access_p (dr_info)
2119               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2120                   && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2121             continue;
2122
2123           if (STMT_VINFO_STRIDED_P (stmt_info))
2124             {
2125               /* Strided loads perform only component accesses, alignment is
2126                  irrelevant for them.  */
2127               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2128                 continue;
2129               do_versioning = false;
2130               break;
2131             }
2132
2133           supportable_dr_alignment
2134             = vect_supportable_dr_alignment (dr_info, false);
2135
2136           if (!supportable_dr_alignment)
2137             {
2138               int mask;
2139               tree vectype;
2140
2141               if (known_alignment_for_access_p (dr_info)
2142                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2143                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2144                 {
2145                   do_versioning = false;
2146                   break;
2147                 }
2148
2149               vectype = STMT_VINFO_VECTYPE (stmt_info);
2150               gcc_assert (vectype);
2151
2152               /* At present we don't support versioning for alignment
2153                  with variable VF, since there's no guarantee that the
2154                  VF is a power of two.  We could relax this if we added
2155                  a way of enforcing a power-of-two size.  */
2156               unsigned HOST_WIDE_INT size;
2157               if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2158                 {
2159                   do_versioning = false;
2160                   break;
2161                 }
2162
2163               /* Forcing alignment in the first iteration is no good if
2164                  we don't keep it across iterations.  For now, just disable
2165                  versioning in this case.
2166                  ?? We could actually unroll the loop to achieve the required
2167                  overall step alignment, and forcing the alignment could be
2168                  done by doing some iterations of the non-vectorized loop.  */
2169               if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2170                                * DR_STEP_ALIGNMENT (dr),
2171                                DR_TARGET_ALIGNMENT (dr_info)))
2172                 {
2173                   do_versioning = false;
2174                   break;
2175                 }
2176
2177               /* The rightmost bits of an aligned address must be zeros.
2178                  Construct the mask needed for this test.  For example,
2179                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2180                  mask must be 15 = 0xf. */
2181               mask = size - 1;
2182
2183               /* FORNOW: use the same mask to test all potentially unaligned
2184                  references in the loop.  The vectorizer currently supports
2185                  a single vector size, see the reference to
2186                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2187                  vectorization factor is computed.  */
2188               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2189                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2190               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2191               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2192             }
2193         }
2194
2195       /* Versioning requires at least one misaligned data reference.  */
2196       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2197         do_versioning = false;
2198       else if (!do_versioning)
2199         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2200     }
2201
2202   if (do_versioning)
2203     {
2204       vec<stmt_vec_info> may_misalign_stmts
2205         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2206       stmt_vec_info stmt_info;
2207
2208       /* It can now be assumed that the data references in the statements
2209          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2210          of the loop being vectorized.  */
2211       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2212         {
2213           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2214           SET_DR_MISALIGNMENT (dr_info, 0);
2215           if (dump_enabled_p ())
2216             dump_printf_loc (MSG_NOTE, vect_location,
2217                              "Alignment of access forced using versioning.\n");
2218         }
2219
2220       if (dump_enabled_p ())
2221         dump_printf_loc (MSG_NOTE, vect_location,
2222                          "Versioning for alignment will be applied.\n");
2223
2224       /* Peeling and versioning can't be done together at this time.  */
2225       gcc_assert (! (do_peeling && do_versioning));
2226
2227       opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2228       gcc_assert (stat);
2229       return stat;
2230     }
2231
2232   /* This point is reached if neither peeling nor versioning is being done.  */
2233   gcc_assert (! (do_peeling || do_versioning));
2234
2235   opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2236   return stat;
2237 }
2238
2239
2240 /* Function vect_find_same_alignment_drs.
2241
2242    Update group and alignment relations in VINFO according to the chosen
2243    vectorization factor.  */
2244
2245 static void
2246 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2247 {
2248   struct data_reference *dra = DDR_A (ddr);
2249   struct data_reference *drb = DDR_B (ddr);
2250   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2251   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2252   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2253   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2254
2255   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2256     return;
2257
2258   if (dra == drb)
2259     return;
2260
2261   if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2262       || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2263     return;
2264
2265   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2266       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2267       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2268     return;
2269
2270   /* Two references with distance zero have the same alignment.  */
2271   poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2272                           - wi::to_poly_offset (DR_INIT (drb)));
2273   if (maybe_ne (diff, 0))
2274     {
2275       /* Get the wider of the two alignments.  */
2276       poly_uint64 align_a =
2277         exact_div (vect_calculate_target_alignment (dr_info_a),
2278                    BITS_PER_UNIT);
2279       poly_uint64 align_b =
2280         exact_div (vect_calculate_target_alignment (dr_info_b),
2281                    BITS_PER_UNIT);
2282       unsigned HOST_WIDE_INT align_a_c, align_b_c;
2283       if (!align_a.is_constant (&align_a_c)
2284           || !align_b.is_constant (&align_b_c))
2285         return;
2286
2287       unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
2288
2289       /* Require the gap to be a multiple of the larger vector alignment.  */
2290       if (!multiple_p (diff, max_align))
2291         return;
2292     }
2293
2294   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2295   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2296   if (dump_enabled_p ())
2297     dump_printf_loc (MSG_NOTE, vect_location,
2298                      "accesses have the same alignment: %T and %T\n",
2299                      DR_REF (dra), DR_REF (drb));
2300 }
2301
2302
2303 /* Function vect_analyze_data_refs_alignment
2304
2305    Analyze the alignment of the data-references in the loop.
2306    Return FALSE if a data reference is found that cannot be vectorized.  */
2307
2308 opt_result
2309 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2310 {
2311   DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2312
2313   /* Mark groups of data references with same alignment using
2314      data dependence information.  */
2315   vec<ddr_p> ddrs = vinfo->shared->ddrs;
2316   struct data_dependence_relation *ddr;
2317   unsigned int i;
2318
2319   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2320     vect_find_same_alignment_drs (vinfo, ddr);
2321
2322   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2323   struct data_reference *dr;
2324
2325   vect_record_base_alignments (vinfo);
2326   FOR_EACH_VEC_ELT (datarefs, i, dr)
2327     {
2328       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2329       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2330         vect_compute_data_ref_alignment (dr_info);
2331     }
2332
2333   return opt_result::success ();
2334 }
2335
2336
2337 /* Analyze alignment of DRs of stmts in NODE.  */
2338
2339 static bool
2340 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2341 {
2342   /* We vectorize from the first scalar stmt in the node unless
2343      the node is permuted in which case we start from the first
2344      element in the group.  */
2345   stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2346   dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2347   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2348     first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2349
2350   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2351   vect_compute_data_ref_alignment (dr_info);
2352   /* For creating the data-ref pointer we need alignment of the
2353      first element anyway.  */
2354   if (dr_info != first_dr_info)
2355     vect_compute_data_ref_alignment (first_dr_info);
2356   if (! verify_data_ref_alignment (dr_info))
2357     {
2358       if (dump_enabled_p ())
2359         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2360                          "not vectorized: bad data alignment in basic "
2361                          "block.\n");
2362       return false;
2363     }
2364
2365   return true;
2366 }
2367
2368 /* Function vect_slp_analyze_instance_alignment
2369
2370    Analyze the alignment of the data-references in the SLP instance.
2371    Return FALSE if a data reference is found that cannot be vectorized.  */
2372
2373 bool
2374 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2375 {
2376   DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2377
2378   slp_tree node;
2379   unsigned i;
2380   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2381     if (! vect_slp_analyze_and_verify_node_alignment (node))
2382       return false;
2383
2384   node = SLP_INSTANCE_TREE (instance);
2385   if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2386       && ! vect_slp_analyze_and_verify_node_alignment
2387              (SLP_INSTANCE_TREE (instance)))
2388     return false;
2389
2390   return true;
2391 }
2392
2393
2394 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2395    accesses of legal size, step, etc.  Detect gaps, single element
2396    interleaving, and other special cases. Set grouped access info.
2397    Collect groups of strided stores for further use in SLP analysis.
2398    Worker for vect_analyze_group_access.  */
2399
2400 static bool
2401 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2402 {
2403   data_reference *dr = dr_info->dr;
2404   tree step = DR_STEP (dr);
2405   tree scalar_type = TREE_TYPE (DR_REF (dr));
2406   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2407   stmt_vec_info stmt_info = dr_info->stmt;
2408   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2409   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2410   HOST_WIDE_INT dr_step = -1;
2411   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2412   bool slp_impossible = false;
2413
2414   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2415      size of the interleaving group (including gaps).  */
2416   if (tree_fits_shwi_p (step))
2417     {
2418       dr_step = tree_to_shwi (step);
2419       /* Check that STEP is a multiple of type size.  Otherwise there is
2420          a non-element-sized gap at the end of the group which we
2421          cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2422          ???  As we can handle non-constant step fine here we should
2423          simply remove uses of DR_GROUP_GAP between the last and first
2424          element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2425          simply not include that gap.  */
2426       if ((dr_step % type_size) != 0)
2427         {
2428           if (dump_enabled_p ())
2429             dump_printf_loc (MSG_NOTE, vect_location,
2430                              "Step %T is not a multiple of the element size"
2431                              " for %T\n",
2432                              step, DR_REF (dr));
2433           return false;
2434         }
2435       groupsize = absu_hwi (dr_step) / type_size;
2436     }
2437   else
2438     groupsize = 0;
2439
2440   /* Not consecutive access is possible only if it is a part of interleaving.  */
2441   if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2442     {
2443       /* Check if it this DR is a part of interleaving, and is a single
2444          element of the group that is accessed in the loop.  */
2445
2446       /* Gaps are supported only for loads. STEP must be a multiple of the type
2447          size.  */
2448       if (DR_IS_READ (dr)
2449           && (dr_step % type_size) == 0
2450           && groupsize > 0)
2451         {
2452           DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2453           DR_GROUP_SIZE (stmt_info) = groupsize;
2454           DR_GROUP_GAP (stmt_info) = groupsize - 1;
2455           if (dump_enabled_p ())
2456             dump_printf_loc (MSG_NOTE, vect_location,
2457                              "Detected single element interleaving %T"
2458                              " step %T\n",
2459                              DR_REF (dr), step);
2460
2461           return true;
2462         }
2463
2464       if (dump_enabled_p ())
2465         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2466                          "not consecutive access %G", stmt_info->stmt);
2467
2468       if (bb_vinfo)
2469         {
2470           /* Mark the statement as unvectorizable.  */
2471           STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2472           return true;
2473         }
2474
2475       if (dump_enabled_p ())
2476         dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2477       STMT_VINFO_STRIDED_P (stmt_info) = true;
2478       return true;
2479     }
2480
2481   if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2482     {
2483       /* First stmt in the interleaving chain. Check the chain.  */
2484       stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2485       struct data_reference *data_ref = dr;
2486       unsigned int count = 1;
2487       tree prev_init = DR_INIT (data_ref);
2488       stmt_vec_info prev = stmt_info;
2489       HOST_WIDE_INT diff, gaps = 0;
2490
2491       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2492       while (next)
2493         {
2494           /* Skip same data-refs.  In case that two or more stmts share
2495              data-ref (supported only for loads), we vectorize only the first
2496              stmt, and the rest get their vectorized loads from the first
2497              one.  */
2498           if (!tree_int_cst_compare (DR_INIT (data_ref),
2499                                      DR_INIT (STMT_VINFO_DATA_REF (next))))
2500             {
2501               if (DR_IS_WRITE (data_ref))
2502                 {
2503                   if (dump_enabled_p ())
2504                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2505                                      "Two store stmts share the same dr.\n");
2506                   return false;
2507                 }
2508
2509               if (dump_enabled_p ())
2510                 dump_printf_loc (MSG_NOTE, vect_location,
2511                                  "Two or more load stmts share the same dr.\n");
2512
2513               /* For load use the same data-ref load.  */
2514               DR_GROUP_SAME_DR_STMT (next) = prev;
2515
2516               prev = next;
2517               next = DR_GROUP_NEXT_ELEMENT (next);
2518               continue;
2519             }
2520
2521           prev = next;
2522           data_ref = STMT_VINFO_DATA_REF (next);
2523
2524           /* All group members have the same STEP by construction.  */
2525           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2526
2527           /* Check that the distance between two accesses is equal to the type
2528              size. Otherwise, we have gaps.  */
2529           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2530                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2531           if (diff != 1)
2532             {
2533               /* FORNOW: SLP of accesses with gaps is not supported.  */
2534               slp_impossible = true;
2535               if (DR_IS_WRITE (data_ref))
2536                 {
2537                   if (dump_enabled_p ())
2538                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2539                                      "interleaved store with gaps\n");
2540                   return false;
2541                 }
2542
2543               gaps += diff - 1;
2544             }
2545
2546           last_accessed_element += diff;
2547
2548           /* Store the gap from the previous member of the group. If there is no
2549              gap in the access, DR_GROUP_GAP is always 1.  */
2550           DR_GROUP_GAP (next) = diff;
2551
2552           prev_init = DR_INIT (data_ref);
2553           next = DR_GROUP_NEXT_ELEMENT (next);
2554           /* Count the number of data-refs in the chain.  */
2555           count++;
2556         }
2557
2558       if (groupsize == 0)
2559         groupsize = count + gaps;
2560
2561       /* This could be UINT_MAX but as we are generating code in a very
2562          inefficient way we have to cap earlier.  See PR78699 for example.  */
2563       if (groupsize > 4096)
2564         {
2565           if (dump_enabled_p ())
2566             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2567                              "group is too large\n");
2568           return false;
2569         }
2570
2571       /* Check that the size of the interleaving is equal to count for stores,
2572          i.e., that there are no gaps.  */
2573       if (groupsize != count
2574           && !DR_IS_READ (dr))
2575         {
2576           groupsize = count;
2577           STMT_VINFO_STRIDED_P (stmt_info) = true;
2578         }
2579
2580       /* If there is a gap after the last load in the group it is the
2581          difference between the groupsize and the last accessed
2582          element.
2583          When there is no gap, this difference should be 0.  */
2584       DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2585
2586       DR_GROUP_SIZE (stmt_info) = groupsize;
2587       if (dump_enabled_p ())
2588         {
2589           dump_printf_loc (MSG_NOTE, vect_location,
2590                            "Detected interleaving ");
2591           if (DR_IS_READ (dr))
2592             dump_printf (MSG_NOTE, "load ");
2593           else if (STMT_VINFO_STRIDED_P (stmt_info))
2594             dump_printf (MSG_NOTE, "strided store ");
2595           else
2596             dump_printf (MSG_NOTE, "store ");
2597           dump_printf (MSG_NOTE, "of size %u\n",
2598                        (unsigned)groupsize);
2599           dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2600           next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2601           while (next)
2602             {
2603               if (DR_GROUP_GAP (next) != 1)
2604                 dump_printf_loc (MSG_NOTE, vect_location,
2605                                  "\t<gap of %d elements>\n",
2606                                  DR_GROUP_GAP (next) - 1);
2607               dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2608               next = DR_GROUP_NEXT_ELEMENT (next);
2609             }
2610           if (DR_GROUP_GAP (stmt_info) != 0)
2611             dump_printf_loc (MSG_NOTE, vect_location,
2612                              "\t<gap of %d elements>\n",
2613                              DR_GROUP_GAP (stmt_info));
2614         }
2615
2616       /* SLP: create an SLP data structure for every interleaving group of
2617          stores for further analysis in vect_analyse_slp.  */
2618       if (DR_IS_WRITE (dr) && !slp_impossible)
2619         {
2620           if (loop_vinfo)
2621             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2622           if (bb_vinfo)
2623             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2624         }
2625     }
2626
2627   return true;
2628 }
2629
2630 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2631    accesses of legal size, step, etc.  Detect gaps, single element
2632    interleaving, and other special cases. Set grouped access info.
2633    Collect groups of strided stores for further use in SLP analysis.  */
2634
2635 static bool
2636 vect_analyze_group_access (dr_vec_info *dr_info)
2637 {
2638   if (!vect_analyze_group_access_1 (dr_info))
2639     {
2640       /* Dissolve the group if present.  */
2641       stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2642       while (stmt_info)
2643         {
2644           stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2645           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2646           DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2647           stmt_info = next;
2648         }
2649       return false;
2650     }
2651   return true;
2652 }
2653
2654 /* Analyze the access pattern of the data-reference DR_INFO.
2655    In case of non-consecutive accesses call vect_analyze_group_access() to
2656    analyze groups of accesses.  */
2657
2658 static bool
2659 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2660 {
2661   data_reference *dr = dr_info->dr;
2662   tree step = DR_STEP (dr);
2663   tree scalar_type = TREE_TYPE (DR_REF (dr));
2664   stmt_vec_info stmt_info = dr_info->stmt;
2665   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2666   struct loop *loop = NULL;
2667
2668   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2669     return true;
2670
2671   if (loop_vinfo)
2672     loop = LOOP_VINFO_LOOP (loop_vinfo);
2673
2674   if (loop_vinfo && !step)
2675     {
2676       if (dump_enabled_p ())
2677         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2678                          "bad data-ref access in loop\n");
2679       return false;
2680     }
2681
2682   /* Allow loads with zero step in inner-loop vectorization.  */
2683   if (loop_vinfo && integer_zerop (step))
2684     {
2685       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2686       if (!nested_in_vect_loop_p (loop, stmt_info))
2687         return DR_IS_READ (dr);
2688       /* Allow references with zero step for outer loops marked
2689          with pragma omp simd only - it guarantees absence of
2690          loop-carried dependencies between inner loop iterations.  */
2691       if (loop->safelen < 2)
2692         {
2693           if (dump_enabled_p ())
2694             dump_printf_loc (MSG_NOTE, vect_location,
2695                              "zero step in inner loop of nest\n");
2696           return false;
2697         }
2698     }
2699
2700   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2701     {
2702       /* Interleaved accesses are not yet supported within outer-loop
2703         vectorization for references in the inner-loop.  */
2704       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2705
2706       /* For the rest of the analysis we use the outer-loop step.  */
2707       step = STMT_VINFO_DR_STEP (stmt_info);
2708       if (integer_zerop (step))
2709         {
2710           if (dump_enabled_p ())
2711             dump_printf_loc (MSG_NOTE, vect_location,
2712                              "zero step in outer loop.\n");
2713           return DR_IS_READ (dr);
2714         }
2715     }
2716
2717   /* Consecutive?  */
2718   if (TREE_CODE (step) == INTEGER_CST)
2719     {
2720       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2721       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2722           || (dr_step < 0
2723               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2724         {
2725           /* Mark that it is not interleaving.  */
2726           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2727           return true;
2728         }
2729     }
2730
2731   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2732     {
2733       if (dump_enabled_p ())
2734         dump_printf_loc (MSG_NOTE, vect_location,
2735                          "grouped access in outer loop.\n");
2736       return false;
2737     }
2738
2739
2740   /* Assume this is a DR handled by non-constant strided load case.  */
2741   if (TREE_CODE (step) != INTEGER_CST)
2742     return (STMT_VINFO_STRIDED_P (stmt_info)
2743             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2744                 || vect_analyze_group_access (dr_info)));
2745
2746   /* Not consecutive access - check if it's a part of interleaving group.  */
2747   return vect_analyze_group_access (dr_info);
2748 }
2749
2750 /* Compare two data-references DRA and DRB to group them into chunks
2751    suitable for grouping.  */
2752
2753 static int
2754 dr_group_sort_cmp (const void *dra_, const void *drb_)
2755 {
2756   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2757   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2758   int cmp;
2759
2760   /* Stabilize sort.  */
2761   if (dra == drb)
2762     return 0;
2763
2764   /* DRs in different loops never belong to the same group.  */
2765   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2766   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2767   if (loopa != loopb)
2768     return loopa->num < loopb->num ? -1 : 1;
2769
2770   /* Ordering of DRs according to base.  */
2771   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2772                                DR_BASE_ADDRESS (drb));
2773   if (cmp != 0)
2774     return cmp;
2775
2776   /* And according to DR_OFFSET.  */
2777   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2778   if (cmp != 0)
2779     return cmp;
2780
2781   /* Put reads before writes.  */
2782   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2783     return DR_IS_READ (dra) ? -1 : 1;
2784
2785   /* Then sort after access size.  */
2786   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2787                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2788   if (cmp != 0)
2789     return cmp;
2790
2791   /* And after step.  */
2792   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2793   if (cmp != 0)
2794     return cmp;
2795
2796   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2797   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2798   if (cmp == 0)
2799     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2800   return cmp;
2801 }
2802
2803 /* If OP is the result of a conversion, return the unconverted value,
2804    otherwise return null.  */
2805
2806 static tree
2807 strip_conversion (tree op)
2808 {
2809   if (TREE_CODE (op) != SSA_NAME)
2810     return NULL_TREE;
2811   gimple *stmt = SSA_NAME_DEF_STMT (op);
2812   if (!is_gimple_assign (stmt)
2813       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2814     return NULL_TREE;
2815   return gimple_assign_rhs1 (stmt);
2816 }
2817
2818 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2819    and STMT2_INFO being in a single group.  */
2820
2821 static bool
2822 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
2823 {
2824   if (gimple_assign_single_p (stmt1_info->stmt))
2825     return gimple_assign_single_p (stmt2_info->stmt);
2826
2827   gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2828   if (call1 && gimple_call_internal_p (call1))
2829     {
2830       /* Check for two masked loads or two masked stores.  */
2831       gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2832       if (!call2 || !gimple_call_internal_p (call2))
2833         return false;
2834       internal_fn ifn = gimple_call_internal_fn (call1);
2835       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2836         return false;
2837       if (ifn != gimple_call_internal_fn (call2))
2838         return false;
2839
2840       /* Check that the masks are the same.  Cope with casts of masks,
2841          like those created by build_mask_conversion.  */
2842       tree mask1 = gimple_call_arg (call1, 2);
2843       tree mask2 = gimple_call_arg (call2, 2);
2844       if (!operand_equal_p (mask1, mask2, 0))
2845         {
2846           mask1 = strip_conversion (mask1);
2847           if (!mask1)
2848             return false;
2849           mask2 = strip_conversion (mask2);
2850           if (!mask2)
2851             return false;
2852           if (!operand_equal_p (mask1, mask2, 0))
2853             return false;
2854         }
2855       return true;
2856     }
2857
2858   return false;
2859 }
2860
2861 /* Function vect_analyze_data_ref_accesses.
2862
2863    Analyze the access pattern of all the data references in the loop.
2864
2865    FORNOW: the only access pattern that is considered vectorizable is a
2866            simple step 1 (consecutive) access.
2867
2868    FORNOW: handle only arrays and pointer accesses.  */
2869
2870 opt_result
2871 vect_analyze_data_ref_accesses (vec_info *vinfo)
2872 {
2873   unsigned int i;
2874   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2875   struct data_reference *dr;
2876
2877   DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2878
2879   if (datarefs.is_empty ())
2880     return opt_result::success ();
2881
2882   /* Sort the array of datarefs to make building the interleaving chains
2883      linear.  Don't modify the original vector's order, it is needed for
2884      determining what dependencies are reversed.  */
2885   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2886   datarefs_copy.qsort (dr_group_sort_cmp);
2887   hash_set<stmt_vec_info> to_fixup;
2888
2889   /* Build the interleaving chains.  */
2890   for (i = 0; i < datarefs_copy.length () - 1;)
2891     {
2892       data_reference_p dra = datarefs_copy[i];
2893       dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2894       stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2895       stmt_vec_info lastinfo = NULL;
2896       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2897           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2898         {
2899           ++i;
2900           continue;
2901         }
2902       for (i = i + 1; i < datarefs_copy.length (); ++i)
2903         {
2904           data_reference_p drb = datarefs_copy[i];
2905           dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2906           stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2907           if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2908               || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2909             break;
2910
2911           /* ???  Imperfect sorting (non-compatible types, non-modulo
2912              accesses, same accesses) can lead to a group to be artificially
2913              split here as we don't just skip over those.  If it really
2914              matters we can push those to a worklist and re-iterate
2915              over them.  The we can just skip ahead to the next DR here.  */
2916
2917           /* DRs in a different loop should not be put into the same
2918              interleaving group.  */
2919           if (gimple_bb (DR_STMT (dra))->loop_father
2920               != gimple_bb (DR_STMT (drb))->loop_father)
2921             break;
2922
2923           /* Check that the data-refs have same first location (except init)
2924              and they are both either store or load (not load and store,
2925              not masked loads or stores).  */
2926           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2927               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2928                                         DR_BASE_ADDRESS (drb)) != 0
2929               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2930               || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
2931             break;
2932
2933           /* Check that the data-refs have the same constant size.  */
2934           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2935           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2936           if (!tree_fits_uhwi_p (sza)
2937               || !tree_fits_uhwi_p (szb)
2938               || !tree_int_cst_equal (sza, szb))
2939             break;
2940
2941           /* Check that the data-refs have the same step.  */
2942           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2943             break;
2944
2945           /* Check the types are compatible.
2946              ???  We don't distinguish this during sorting.  */
2947           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2948                                    TREE_TYPE (DR_REF (drb))))
2949             break;
2950
2951           /* Check that the DR_INITs are compile-time constants.  */
2952           if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2953               || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2954             break;
2955
2956           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2957           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2958           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2959           HOST_WIDE_INT init_prev
2960             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2961           gcc_assert (init_a <= init_b
2962                       && init_a <= init_prev
2963                       && init_prev <= init_b);
2964
2965           /* Do not place the same access in the interleaving chain twice.  */
2966           if (init_b == init_prev)
2967             {
2968               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2969                           < gimple_uid (DR_STMT (drb)));
2970               /* Simply link in duplicates and fix up the chain below.  */
2971             }
2972           else
2973             {
2974               /* If init_b == init_a + the size of the type * k, we have an
2975                  interleaving, and DRA is accessed before DRB.  */
2976               HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2977               if (type_size_a == 0
2978                   || (init_b - init_a) % type_size_a != 0)
2979                 break;
2980
2981               /* If we have a store, the accesses are adjacent.  This splits
2982                  groups into chunks we support (we don't support vectorization
2983                  of stores with gaps).  */
2984               if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2985                 break;
2986
2987               /* If the step (if not zero or non-constant) is greater than the
2988                  difference between data-refs' inits this splits groups into
2989                  suitable sizes.  */
2990               if (tree_fits_shwi_p (DR_STEP (dra)))
2991                 {
2992                   HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2993                   if (step != 0 && step <= (init_b - init_a))
2994                     break;
2995                 }
2996             }
2997
2998           if (dump_enabled_p ())
2999             dump_printf_loc (MSG_NOTE, vect_location,
3000                              DR_IS_READ (dra)
3001                              ? "Detected interleaving load %T and %T\n"
3002                              : "Detected interleaving store %T and %T\n",
3003                              DR_REF (dra), DR_REF (drb));
3004
3005           /* Link the found element into the group list.  */
3006           if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3007             {
3008               DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3009               lastinfo = stmtinfo_a;
3010             }
3011           DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3012           DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3013           lastinfo = stmtinfo_b;
3014
3015           if (init_b == init_prev
3016               && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3017               && dump_enabled_p ())
3018             dump_printf_loc (MSG_NOTE, vect_location,
3019                              "Queuing group with duplicate access for fixup\n");
3020         }
3021     }
3022
3023   /* Fixup groups with duplicate entries by splitting it.  */
3024   while (1)
3025     {
3026       hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3027       if (!(it != to_fixup.end ()))
3028         break;
3029       stmt_vec_info grp = *it;
3030       to_fixup.remove (grp);
3031
3032       /* Find the earliest duplicate group member.  */
3033       unsigned first_duplicate = -1u;
3034       stmt_vec_info next, g = grp;
3035       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3036         {
3037           if ((DR_INIT (STMT_VINFO_DR_INFO (next)->dr)
3038                == DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3039               && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3040             first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3041           g = next;
3042         }
3043       if (first_duplicate == -1U)
3044         continue;
3045
3046       /* Then move all stmts after the first duplicate to a new group.
3047          Note this is a heuristic but one with the property that *it
3048          is fixed up completely.  */
3049       g = grp;
3050       stmt_vec_info newgroup = NULL, ng = grp;
3051       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3052         {
3053           if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3054             {
3055               DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3056               if (!newgroup)
3057                 newgroup = next;
3058               else
3059                 DR_GROUP_NEXT_ELEMENT (ng) = next;
3060               ng = next;
3061               DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3062             }
3063           else
3064             g = DR_GROUP_NEXT_ELEMENT (g);
3065         }
3066       DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3067
3068       /* Fixup the new group which still may contain duplicates.  */
3069       to_fixup.add (newgroup);
3070     }
3071
3072   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3073     {
3074       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3075       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3076           && !vect_analyze_data_ref_access (dr_info))
3077         {
3078           if (dump_enabled_p ())
3079             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3080                              "not vectorized: complicated access pattern.\n");
3081
3082           if (is_a <bb_vec_info> (vinfo))
3083             {
3084               /* Mark the statement as not vectorizable.  */
3085               STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3086               continue;
3087             }
3088           else
3089             {
3090               datarefs_copy.release ();
3091               return opt_result::failure_at (dr_info->stmt->stmt,
3092                                              "not vectorized:"
3093                                              " complicated access pattern.\n");
3094             }
3095         }
3096     }
3097
3098   datarefs_copy.release ();
3099   return opt_result::success ();
3100 }
3101
3102 /* Function vect_vfa_segment_size.
3103
3104    Input:
3105      DR_INFO: The data reference.
3106      LENGTH_FACTOR: segment length to consider.
3107
3108    Return a value suitable for the dr_with_seg_len::seg_len field.
3109    This is the "distance travelled" by the pointer from the first
3110    iteration in the segment to the last.  Note that it does not include
3111    the size of the access; in effect it only describes the first byte.  */
3112
3113 static tree
3114 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3115 {
3116   length_factor = size_binop (MINUS_EXPR,
3117                               fold_convert (sizetype, length_factor),
3118                               size_one_node);
3119   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3120                      length_factor);
3121 }
3122
3123 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3124    gives the worst-case number of bytes covered by the segment.  */
3125
3126 static unsigned HOST_WIDE_INT
3127 vect_vfa_access_size (dr_vec_info *dr_info)
3128 {
3129   stmt_vec_info stmt_vinfo = dr_info->stmt;
3130   tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3131   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3132   unsigned HOST_WIDE_INT access_size = ref_size;
3133   if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3134     {
3135       gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3136       access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3137     }
3138   if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3139       && (vect_supportable_dr_alignment (dr_info, false)
3140           == dr_explicit_realign_optimized))
3141     {
3142       /* We might access a full vector's worth.  */
3143       tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3144       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3145     }
3146   return access_size;
3147 }
3148
3149 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3150    describes.  */
3151
3152 static unsigned int
3153 vect_vfa_align (dr_vec_info *dr_info)
3154 {
3155   return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3156 }
3157
3158 /* Function vect_no_alias_p.
3159
3160    Given data references A and B with equal base and offset, see whether
3161    the alias relation can be decided at compilation time.  Return 1 if
3162    it can and the references alias, 0 if it can and the references do
3163    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3164    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3165    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3166
3167 static int
3168 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3169                          tree segment_length_a, tree segment_length_b,
3170                          unsigned HOST_WIDE_INT access_size_a,
3171                          unsigned HOST_WIDE_INT access_size_b)
3172 {
3173   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3174   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3175   poly_uint64 const_length_a;
3176   poly_uint64 const_length_b;
3177
3178   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3179      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3180      [a, a+12) */
3181   if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3182     {
3183       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3184       offset_a = (offset_a + access_size_a) - const_length_a;
3185     }
3186   else
3187     const_length_a = tree_to_poly_uint64 (segment_length_a);
3188   if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3189     {
3190       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3191       offset_b = (offset_b + access_size_b) - const_length_b;
3192     }
3193   else
3194     const_length_b = tree_to_poly_uint64 (segment_length_b);
3195
3196   const_length_a += access_size_a;
3197   const_length_b += access_size_b;
3198
3199   if (ranges_known_overlap_p (offset_a, const_length_a,
3200                               offset_b, const_length_b))
3201     return 1;
3202
3203   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3204                                offset_b, const_length_b))
3205     return 0;
3206
3207   return -1;
3208 }
3209
3210 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3211    in DDR is >= VF.  */
3212
3213 static bool
3214 dependence_distance_ge_vf (data_dependence_relation *ddr,
3215                            unsigned int loop_depth, poly_uint64 vf)
3216 {
3217   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3218       || DDR_NUM_DIST_VECTS (ddr) == 0)
3219     return false;
3220
3221   /* If the dependence is exact, we should have limited the VF instead.  */
3222   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3223
3224   unsigned int i;
3225   lambda_vector dist_v;
3226   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3227     {
3228       HOST_WIDE_INT dist = dist_v[loop_depth];
3229       if (dist != 0
3230           && !(dist > 0 && DDR_REVERSED_P (ddr))
3231           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3232         return false;
3233     }
3234
3235   if (dump_enabled_p ())
3236     dump_printf_loc (MSG_NOTE, vect_location,
3237                      "dependence distance between %T and %T is >= VF\n",
3238                      DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3239
3240   return true;
3241 }
3242
3243 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3244
3245 static void
3246 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3247 {
3248   dump_printf (dump_kind, "%s (%T) >= ",
3249                lower_bound.unsigned_p ? "unsigned" : "abs",
3250                lower_bound.expr);
3251   dump_dec (dump_kind, lower_bound.min_value);
3252 }
3253
3254 /* Record that the vectorized loop requires the vec_lower_bound described
3255    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3256
3257 static void
3258 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3259                         poly_uint64 min_value)
3260 {
3261   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3262   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3263     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3264       {
3265         unsigned_p &= lower_bounds[i].unsigned_p;
3266         min_value = upper_bound (lower_bounds[i].min_value, min_value);
3267         if (lower_bounds[i].unsigned_p != unsigned_p
3268             || maybe_lt (lower_bounds[i].min_value, min_value))
3269           {
3270             lower_bounds[i].unsigned_p = unsigned_p;
3271             lower_bounds[i].min_value = min_value;
3272             if (dump_enabled_p ())
3273               {
3274                 dump_printf_loc (MSG_NOTE, vect_location,
3275                                  "updating run-time check to ");
3276                 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3277                 dump_printf (MSG_NOTE, "\n");
3278               }
3279           }
3280         return;
3281       }
3282
3283   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3284   if (dump_enabled_p ())
3285     {
3286       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3287       dump_lower_bound (MSG_NOTE, lower_bound);
3288       dump_printf (MSG_NOTE, "\n");
3289     }
3290   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3291 }
3292
3293 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3294    will span fewer than GAP bytes.  */
3295
3296 static bool
3297 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3298                   poly_int64 gap)
3299 {
3300   stmt_vec_info stmt_info = dr_info->stmt;
3301   HOST_WIDE_INT count
3302     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3303   if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3304     count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3305   return (estimated_poly_value (gap)
3306           <= count * vect_get_scalar_dr_size (dr_info));
3307 }
3308
3309 /* Return true if we know that there is no alias between DR_INFO_A and
3310    DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3311    When returning true, set *LOWER_BOUND_OUT to this N.  */
3312
3313 static bool
3314 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3315                                 poly_uint64 *lower_bound_out)
3316 {
3317   /* Check that there is a constant gap of known sign between DR_A
3318      and DR_B.  */
3319   data_reference *dr_a = dr_info_a->dr;
3320   data_reference *dr_b = dr_info_b->dr;
3321   poly_int64 init_a, init_b;
3322   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3323       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3324       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3325       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3326       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3327       || !ordered_p (init_a, init_b))
3328     return false;
3329
3330   /* Sort DR_A and DR_B by the address they access.  */
3331   if (maybe_lt (init_b, init_a))
3332     {
3333       std::swap (init_a, init_b);
3334       std::swap (dr_info_a, dr_info_b);
3335       std::swap (dr_a, dr_b);
3336     }
3337
3338   /* If the two accesses could be dependent within a scalar iteration,
3339      make sure that we'd retain their order.  */
3340   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3341       && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3342     return false;
3343
3344   /* There is no alias if abs (DR_STEP) is greater than or equal to
3345      the bytes spanned by the combination of the two accesses.  */
3346   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3347   return true;
3348 }
3349
3350 /* Function vect_prune_runtime_alias_test_list.
3351
3352    Prune a list of ddrs to be tested at run-time by versioning for alias.
3353    Merge several alias checks into one if possible.
3354    Return FALSE if resulting list of ddrs is longer then allowed by
3355    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3356
3357 opt_result
3358 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3359 {
3360   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3361   hash_set <tree_pair_hash> compared_objects;
3362
3363   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3364   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3365     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3366   vec<vec_object_pair> &check_unequal_addrs
3367     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3368   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3369   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3370
3371   ddr_p ddr;
3372   unsigned int i;
3373   tree length_factor;
3374
3375   DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3376
3377   /* Step values are irrelevant for aliasing if the number of vector
3378      iterations is equal to the number of scalar iterations (which can
3379      happen for fully-SLP loops).  */
3380   bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3381
3382   if (!ignore_step_p)
3383     {
3384       /* Convert the checks for nonzero steps into bound tests.  */
3385       tree value;
3386       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3387         vect_check_lower_bound (loop_vinfo, value, true, 1);
3388     }
3389
3390   if (may_alias_ddrs.is_empty ())
3391     return opt_result::success ();
3392
3393   comp_alias_ddrs.create (may_alias_ddrs.length ());
3394
3395   unsigned int loop_depth
3396     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3397                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3398
3399   /* First, we collect all data ref pairs for aliasing checks.  */
3400   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3401     {
3402       int comp_res;
3403       poly_uint64 lower_bound;
3404       tree segment_length_a, segment_length_b;
3405       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3406       unsigned int align_a, align_b;
3407
3408       /* Ignore the alias if the VF we chose ended up being no greater
3409          than the dependence distance.  */
3410       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3411         continue;
3412
3413       if (DDR_OBJECT_A (ddr))
3414         {
3415           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3416           if (!compared_objects.add (new_pair))
3417             {
3418               if (dump_enabled_p ())
3419                 dump_printf_loc (MSG_NOTE, vect_location,
3420                                  "checking that %T and %T"
3421                                  " have different addresses\n",
3422                                  new_pair.first, new_pair.second);
3423               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3424             }
3425           continue;
3426         }
3427
3428       dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3429       stmt_vec_info stmt_info_a = dr_info_a->stmt;
3430
3431       dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3432       stmt_vec_info stmt_info_b = dr_info_b->stmt;
3433
3434       /* Skip the pair if inter-iteration dependencies are irrelevant
3435          and intra-iteration dependencies are guaranteed to be honored.  */
3436       if (ignore_step_p
3437           && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3438               || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3439                                                  &lower_bound)))
3440         {
3441           if (dump_enabled_p ())
3442             dump_printf_loc (MSG_NOTE, vect_location,
3443                              "no need for alias check between "
3444                              "%T and %T when VF is 1\n",
3445                              DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3446           continue;
3447         }
3448
3449       /* See whether we can handle the alias using a bounds check on
3450          the step, and whether that's likely to be the best approach.
3451          (It might not be, for example, if the minimum step is much larger
3452          than the number of bytes handled by one vector iteration.)  */
3453       if (!ignore_step_p
3454           && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3455           && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3456                                              &lower_bound)
3457           && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3458               || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3459         {
3460           bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3461           if (dump_enabled_p ())
3462             {
3463               dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3464                                "%T and %T when the step %T is outside ",
3465                                DR_REF (dr_info_a->dr),
3466                                DR_REF (dr_info_b->dr),
3467                                DR_STEP (dr_info_a->dr));
3468               if (unsigned_p)
3469                 dump_printf (MSG_NOTE, "[0");
3470               else
3471                 {
3472                   dump_printf (MSG_NOTE, "(");
3473                   dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3474                 }
3475               dump_printf (MSG_NOTE, ", ");
3476               dump_dec (MSG_NOTE, lower_bound);
3477               dump_printf (MSG_NOTE, ")\n");
3478             }
3479           vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3480                                   unsigned_p, lower_bound);
3481           continue;
3482         }
3483
3484       stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3485       if (dr_group_first_a)
3486         {
3487           stmt_info_a = dr_group_first_a;
3488           dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3489         }
3490
3491       stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3492       if (dr_group_first_b)
3493         {
3494           stmt_info_b = dr_group_first_b;
3495           dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3496         }
3497
3498       if (ignore_step_p)
3499         {
3500           segment_length_a = size_zero_node;
3501           segment_length_b = size_zero_node;
3502         }
3503       else
3504         {
3505           if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3506                                 DR_STEP (dr_info_b->dr), 0))
3507             length_factor = scalar_loop_iters;
3508           else
3509             length_factor = size_int (vect_factor);
3510           segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3511           segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3512         }
3513       access_size_a = vect_vfa_access_size (dr_info_a);
3514       access_size_b = vect_vfa_access_size (dr_info_b);
3515       align_a = vect_vfa_align (dr_info_a);
3516       align_b = vect_vfa_align (dr_info_b);
3517
3518       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3519                                         DR_BASE_ADDRESS (dr_info_b->dr));
3520       if (comp_res == 0)
3521         comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3522                                           DR_OFFSET (dr_info_b->dr));
3523
3524       /* See whether the alias is known at compilation time.  */
3525       if (comp_res == 0
3526           && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3527           && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3528           && poly_int_tree_p (segment_length_a)
3529           && poly_int_tree_p (segment_length_b))
3530         {
3531           int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3532                                              segment_length_a,
3533                                              segment_length_b,
3534                                              access_size_a,
3535                                              access_size_b);
3536           if (res >= 0 && dump_enabled_p ())
3537             {
3538               dump_printf_loc (MSG_NOTE, vect_location,
3539                                "can tell at compile time that %T and %T",
3540                                DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3541               if (res == 0)
3542                 dump_printf (MSG_NOTE, " do not alias\n");
3543               else
3544                 dump_printf (MSG_NOTE, " alias\n");
3545             }
3546
3547           if (res == 0)
3548             continue;
3549
3550           if (res == 1)
3551             return opt_result::failure_at (stmt_info_b->stmt,
3552                                            "not vectorized:"
3553                                            " compilation time alias: %G%G",
3554                                            stmt_info_a->stmt,
3555                                            stmt_info_b->stmt);
3556         }
3557
3558       dr_with_seg_len_pair_t dr_with_seg_len_pair
3559         (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3560                           access_size_a, align_a),
3561          dr_with_seg_len (dr_info_b->dr, segment_length_b,
3562                           access_size_b, align_b));
3563
3564       /* Canonicalize pairs by sorting the two DR members.  */
3565       if (comp_res > 0)
3566         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3567
3568       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3569     }
3570
3571   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3572
3573   unsigned int count = (comp_alias_ddrs.length ()
3574                         + check_unequal_addrs.length ());
3575
3576   if (dump_enabled_p ())
3577     dump_printf_loc (MSG_NOTE, vect_location,
3578                      "improved number of alias checks from %d to %d\n",
3579                      may_alias_ddrs.length (), count);
3580   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3581     return opt_result::failure_at
3582       (vect_location,
3583        "number of versioning for alias "
3584        "run-time tests exceeds %d "
3585        "(--param vect-max-version-for-alias-checks)\n",
3586        PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3587
3588   return opt_result::success ();
3589 }
3590
3591 /* Check whether we can use an internal function for a gather load
3592    or scatter store.  READ_P is true for loads and false for stores.
3593    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3594    the type of the memory elements being loaded or stored.  OFFSET_BITS
3595    is the number of bits in each scalar offset and OFFSET_SIGN is the
3596    sign of the offset.  SCALE is the amount by which the offset should
3597    be multiplied *after* it has been converted to address width.
3598
3599    Return true if the function is supported, storing the function
3600    id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT.  */
3601
3602 bool
3603 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3604                           tree memory_type, unsigned int offset_bits,
3605                           signop offset_sign, int scale,
3606                           internal_fn *ifn_out, tree *element_type_out)
3607 {
3608   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3609   unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3610   if (offset_bits > element_bits)
3611     /* Internal functions require the offset to be the same width as
3612        the vector elements.  We can extend narrower offsets, but it isn't
3613        safe to truncate wider offsets.  */
3614     return false;
3615
3616   if (element_bits != memory_bits)
3617     /* For now the vector elements must be the same width as the
3618        memory elements.  */
3619     return false;
3620
3621   /* Work out which function we need.  */
3622   internal_fn ifn;
3623   if (read_p)
3624     ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3625   else
3626     ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3627
3628   /* Test whether the target supports this combination.  */
3629   if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3630                                                offset_sign, scale))
3631     return false;
3632
3633   *ifn_out = ifn;
3634   *element_type_out = TREE_TYPE (vectype);
3635   return true;
3636 }
3637
3638 /* STMT_INFO is a call to an internal gather load or scatter store function.
3639    Describe the operation in INFO.  */
3640
3641 static void
3642 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3643                                    gather_scatter_info *info)
3644 {
3645   gcall *call = as_a <gcall *> (stmt_info->stmt);
3646   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3647   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3648
3649   info->ifn = gimple_call_internal_fn (call);
3650   info->decl = NULL_TREE;
3651   info->base = gimple_call_arg (call, 0);
3652   info->offset = gimple_call_arg (call, 1);
3653   info->offset_dt = vect_unknown_def_type;
3654   info->offset_vectype = NULL_TREE;
3655   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3656   info->element_type = TREE_TYPE (vectype);
3657   info->memory_type = TREE_TYPE (DR_REF (dr));
3658 }
3659
3660 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3661    gather load or scatter store.  Describe the operation in *INFO if so.  */
3662
3663 bool
3664 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3665                            gather_scatter_info *info)
3666 {
3667   HOST_WIDE_INT scale = 1;
3668   poly_int64 pbitpos, pbitsize;
3669   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3670   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3671   tree offtype = NULL_TREE;
3672   tree decl = NULL_TREE, base, off;
3673   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3674   tree memory_type = TREE_TYPE (DR_REF (dr));
3675   machine_mode pmode;
3676   int punsignedp, reversep, pvolatilep = 0;
3677   internal_fn ifn;
3678   tree element_type;
3679   bool masked_p = false;
3680
3681   /* See whether this is already a call to a gather/scatter internal function.
3682      If not, see whether it's a masked load or store.  */
3683   gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3684   if (call && gimple_call_internal_p (call))
3685     {
3686       ifn = gimple_call_internal_fn (call);
3687       if (internal_gather_scatter_fn_p (ifn))
3688         {
3689           vect_describe_gather_scatter_call (stmt_info, info);
3690           return true;
3691         }
3692       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3693     }
3694
3695   /* True if we should aim to use internal functions rather than
3696      built-in functions.  */
3697   bool use_ifn_p = (DR_IS_READ (dr)
3698                     ? supports_vec_gather_load_p ()
3699                     : supports_vec_scatter_store_p ());
3700
3701   base = DR_REF (dr);
3702   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3703      see if we can use the def stmt of the address.  */
3704   if (masked_p
3705       && TREE_CODE (base) == MEM_REF
3706       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3707       && integer_zerop (TREE_OPERAND (base, 1))
3708       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3709     {
3710       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3711       if (is_gimple_assign (def_stmt)
3712           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3713         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3714     }
3715
3716   /* The gather and scatter builtins need address of the form
3717      loop_invariant + vector * {1, 2, 4, 8}
3718      or
3719      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3720      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3721      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3722      multiplications and additions in it.  To get a vector, we need
3723      a single SSA_NAME that will be defined in the loop and will
3724      contain everything that is not loop invariant and that can be
3725      vectorized.  The following code attempts to find such a preexistng
3726      SSA_NAME OFF and put the loop invariants into a tree BASE
3727      that can be gimplified before the loop.  */
3728   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3729                               &punsignedp, &reversep, &pvolatilep);
3730   if (reversep)
3731     return false;
3732
3733   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3734
3735   if (TREE_CODE (base) == MEM_REF)
3736     {
3737       if (!integer_zerop (TREE_OPERAND (base, 1)))
3738         {
3739           if (off == NULL_TREE)
3740             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3741           else
3742             off = size_binop (PLUS_EXPR, off,
3743                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3744         }
3745       base = TREE_OPERAND (base, 0);
3746     }
3747   else
3748     base = build_fold_addr_expr (base);
3749
3750   if (off == NULL_TREE)
3751     off = size_zero_node;
3752
3753   /* If base is not loop invariant, either off is 0, then we start with just
3754      the constant offset in the loop invariant BASE and continue with base
3755      as OFF, otherwise give up.
3756      We could handle that case by gimplifying the addition of base + off
3757      into some SSA_NAME and use that as off, but for now punt.  */
3758   if (!expr_invariant_in_loop_p (loop, base))
3759     {
3760       if (!integer_zerop (off))
3761         return false;
3762       off = base;
3763       base = size_int (pbytepos);
3764     }
3765   /* Otherwise put base + constant offset into the loop invariant BASE
3766      and continue with OFF.  */
3767   else
3768     {
3769       base = fold_convert (sizetype, base);
3770       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3771     }
3772
3773   /* OFF at this point may be either a SSA_NAME or some tree expression
3774      from get_inner_reference.  Try to peel off loop invariants from it
3775      into BASE as long as possible.  */
3776   STRIP_NOPS (off);
3777   while (offtype == NULL_TREE)
3778     {
3779       enum tree_code code;
3780       tree op0, op1, add = NULL_TREE;
3781
3782       if (TREE_CODE (off) == SSA_NAME)
3783         {
3784           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3785
3786           if (expr_invariant_in_loop_p (loop, off))
3787             return false;
3788
3789           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3790             break;
3791
3792           op0 = gimple_assign_rhs1 (def_stmt);
3793           code = gimple_assign_rhs_code (def_stmt);
3794           op1 = gimple_assign_rhs2 (def_stmt);
3795         }
3796       else
3797         {
3798           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3799             return false;
3800           code = TREE_CODE (off);
3801           extract_ops_from_tree (off, &code, &op0, &op1);
3802         }
3803       switch (code)
3804         {
3805         case POINTER_PLUS_EXPR:
3806         case PLUS_EXPR:
3807           if (expr_invariant_in_loop_p (loop, op0))
3808             {
3809               add = op0;
3810               off = op1;
3811             do_add:
3812               add = fold_convert (sizetype, add);
3813               if (scale != 1)
3814                 add = size_binop (MULT_EXPR, add, size_int (scale));
3815               base = size_binop (PLUS_EXPR, base, add);
3816               continue;
3817             }
3818           if (expr_invariant_in_loop_p (loop, op1))
3819             {
3820               add = op1;
3821               off = op0;
3822               goto do_add;
3823             }
3824           break;
3825         case MINUS_EXPR:
3826           if (expr_invariant_in_loop_p (loop, op1))
3827             {
3828               add = fold_convert (sizetype, op1);
3829               add = size_binop (MINUS_EXPR, size_zero_node, add);
3830               off = op0;
3831               goto do_add;
3832             }
3833           break;
3834         case MULT_EXPR:
3835           if (scale == 1 && tree_fits_shwi_p (op1))
3836             {
3837               int new_scale = tree_to_shwi (op1);
3838               /* Only treat this as a scaling operation if the target
3839                  supports it.  */
3840               if (use_ifn_p
3841                   && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3842                                                 vectype, memory_type, 1,
3843                                                 TYPE_SIGN (TREE_TYPE (op0)),
3844                                                 new_scale, &ifn,
3845                                                 &element_type))
3846                 break;
3847               scale = new_scale;
3848               off = op0;
3849               continue;
3850             }
3851           break;
3852         case SSA_NAME:
3853           off = op0;
3854           continue;
3855         CASE_CONVERT:
3856           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3857               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3858             break;
3859           if (TYPE_PRECISION (TREE_TYPE (op0))
3860               == TYPE_PRECISION (TREE_TYPE (off)))
3861             {
3862               off = op0;
3863               continue;
3864             }
3865
3866           /* The internal functions need the offset to be the same width
3867              as the elements of VECTYPE.  Don't include operations that
3868              cast the offset from that width to a different width.  */
3869           if (use_ifn_p
3870               && (int_size_in_bytes (TREE_TYPE (vectype))
3871                   == int_size_in_bytes (TREE_TYPE (off))))
3872             break;
3873
3874           if (TYPE_PRECISION (TREE_TYPE (op0))
3875               < TYPE_PRECISION (TREE_TYPE (off)))
3876             {
3877               off = op0;
3878               offtype = TREE_TYPE (off);
3879               STRIP_NOPS (off);
3880               continue;
3881             }
3882           break;
3883         default:
3884           break;
3885         }
3886       break;
3887     }
3888
3889   /* If at the end OFF still isn't a SSA_NAME or isn't
3890      defined in the loop, punt.  */
3891   if (TREE_CODE (off) != SSA_NAME
3892       || expr_invariant_in_loop_p (loop, off))
3893     return false;
3894
3895   if (offtype == NULL_TREE)
3896     offtype = TREE_TYPE (off);
3897
3898   if (use_ifn_p)
3899     {
3900       if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3901                                      memory_type, TYPE_PRECISION (offtype),
3902                                      TYPE_SIGN (offtype), scale, &ifn,
3903                                      &element_type))
3904         return false;
3905     }
3906   else
3907     {
3908       if (DR_IS_READ (dr))
3909         {
3910           if (targetm.vectorize.builtin_gather)
3911             decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3912         }
3913       else
3914         {
3915           if (targetm.vectorize.builtin_scatter)
3916             decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3917         }
3918
3919       if (!decl)
3920         return false;
3921
3922       ifn = IFN_LAST;
3923       element_type = TREE_TYPE (vectype);
3924     }
3925
3926   info->ifn = ifn;
3927   info->decl = decl;
3928   info->base = base;
3929   info->offset = off;
3930   info->offset_dt = vect_unknown_def_type;
3931   info->offset_vectype = NULL_TREE;
3932   info->scale = scale;
3933   info->element_type = element_type;
3934   info->memory_type = memory_type;
3935   return true;
3936 }
3937
3938 /* Find the data references in STMT, analyze them with respect to LOOP and
3939    append them to DATAREFS.  Return false if datarefs in this stmt cannot
3940    be handled.  */
3941
3942 opt_result
3943 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3944                                vec<data_reference_p> *datarefs)
3945 {
3946   /* We can ignore clobbers for dataref analysis - they are removed during
3947      loop vectorization and BB vectorization checks dependences with a
3948      stmt walk.  */
3949   if (gimple_clobber_p (stmt))
3950     return opt_result::success ();
3951
3952   if (gimple_has_volatile_ops (stmt))
3953     return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
3954                                    stmt);
3955
3956   if (stmt_can_throw_internal (cfun, stmt))
3957     return opt_result::failure_at (stmt,
3958                                    "not vectorized:"
3959                                    " statement can throw an exception: %G",
3960                                    stmt);
3961
3962   auto_vec<data_reference_p, 2> refs;
3963   opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
3964   if (!res)
3965     return res;
3966
3967   if (refs.is_empty ())
3968     return opt_result::success ();
3969
3970   if (refs.length () > 1)
3971     return opt_result::failure_at (stmt,
3972                                    "not vectorized:"
3973                                    " more than one data ref in stmt: %G", stmt);
3974
3975   if (gcall *call = dyn_cast <gcall *> (stmt))
3976     if (!gimple_call_internal_p (call)
3977         || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
3978             && gimple_call_internal_fn (call) != IFN_MASK_STORE))
3979       return opt_result::failure_at (stmt,
3980                                      "not vectorized: dr in a call %G", stmt);
3981
3982   data_reference_p dr = refs.pop ();
3983   if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3984       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3985     return opt_result::failure_at (stmt,
3986                                    "not vectorized:"
3987                                    " statement is bitfield access %G", stmt);
3988
3989   if (DR_BASE_ADDRESS (dr)
3990       && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3991     return opt_result::failure_at (stmt,
3992                                    "not vectorized:"
3993                                    " base addr of dr is a constant\n");
3994
3995   /* Check whether this may be a SIMD lane access and adjust the
3996      DR to make it easier for us to handle it.  */
3997   if (loop
3998       && loop->simduid
3999       && (!DR_BASE_ADDRESS (dr)
4000           || !DR_OFFSET (dr)
4001           || !DR_INIT (dr)
4002           || !DR_STEP (dr)))
4003     {
4004       struct data_reference *newdr
4005         = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4006                            DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4007       if (DR_BASE_ADDRESS (newdr)
4008           && DR_OFFSET (newdr)
4009           && DR_INIT (newdr)
4010           && DR_STEP (newdr)
4011           && integer_zerop (DR_STEP (newdr)))
4012         {
4013           tree off = DR_OFFSET (newdr);
4014           STRIP_NOPS (off);
4015           if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4016               && TREE_CODE (off) == MULT_EXPR
4017               && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4018             {
4019               tree step = TREE_OPERAND (off, 1);
4020               off = TREE_OPERAND (off, 0);
4021               STRIP_NOPS (off);
4022               if (CONVERT_EXPR_P (off)
4023                   && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4024                       < TYPE_PRECISION (TREE_TYPE (off))))
4025                 off = TREE_OPERAND (off, 0);
4026               if (TREE_CODE (off) == SSA_NAME)
4027                 {
4028                   gimple *def = SSA_NAME_DEF_STMT (off);
4029                   tree reft = TREE_TYPE (DR_REF (newdr));
4030                   if (is_gimple_call (def)
4031                       && gimple_call_internal_p (def)
4032                       && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4033                     {
4034                       tree arg = gimple_call_arg (def, 0);
4035                       gcc_assert (TREE_CODE (arg) == SSA_NAME);
4036                       arg = SSA_NAME_VAR (arg);
4037                       if (arg == loop->simduid
4038                           /* For now.  */
4039                           && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4040                         {
4041                           DR_OFFSET (newdr) = ssize_int (0);
4042                           DR_STEP (newdr) = step;
4043                           DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4044                           DR_STEP_ALIGNMENT (newdr)
4045                             = highest_pow2_factor (step);
4046                           /* Mark as simd-lane access.  */
4047                           newdr->aux = (void *)-1;
4048                           free_data_ref (dr);
4049                           datarefs->safe_push (newdr);
4050                           return opt_result::success ();
4051                         }
4052                     }
4053                 }
4054             }
4055         }
4056       free_data_ref (newdr);
4057     }
4058
4059   datarefs->safe_push (dr);
4060   return opt_result::success ();
4061 }
4062
4063 /* Function vect_analyze_data_refs.
4064
4065   Find all the data references in the loop or basic block.
4066
4067    The general structure of the analysis of data refs in the vectorizer is as
4068    follows:
4069    1- vect_analyze_data_refs(loop/bb): call
4070       compute_data_dependences_for_loop/bb to find and analyze all data-refs
4071       in the loop/bb and their dependences.
4072    2- vect_analyze_dependences(): apply dependence testing using ddrs.
4073    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4074    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4075
4076 */
4077
4078 opt_result
4079 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4080 {
4081   struct loop *loop = NULL;
4082   unsigned int i;
4083   struct data_reference *dr;
4084   tree scalar_type;
4085
4086   DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4087
4088   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4089     loop = LOOP_VINFO_LOOP (loop_vinfo);
4090
4091   /* Go through the data-refs, check that the analysis succeeded.  Update
4092      pointer from stmt_vec_info struct to DR and vectype.  */
4093
4094   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4095   FOR_EACH_VEC_ELT (datarefs, i, dr)
4096     {
4097       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4098       poly_uint64 vf;
4099
4100       gcc_assert (DR_REF (dr));
4101       stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4102       gcc_assert (!stmt_info->dr_aux.dr);
4103       stmt_info->dr_aux.dr = dr;
4104       stmt_info->dr_aux.stmt = stmt_info;
4105
4106       /* Check that analysis of the data-ref succeeded.  */
4107       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4108           || !DR_STEP (dr))
4109         {
4110           bool maybe_gather
4111             = DR_IS_READ (dr)
4112               && !TREE_THIS_VOLATILE (DR_REF (dr))
4113               && (targetm.vectorize.builtin_gather != NULL
4114                   || supports_vec_gather_load_p ());
4115           bool maybe_scatter
4116             = DR_IS_WRITE (dr)
4117               && !TREE_THIS_VOLATILE (DR_REF (dr))
4118               && (targetm.vectorize.builtin_scatter != NULL
4119                   || supports_vec_scatter_store_p ());
4120
4121           /* If target supports vector gather loads or scatter stores,
4122              see if they can't be used.  */
4123           if (is_a <loop_vec_info> (vinfo)
4124               && !nested_in_vect_loop_p (loop, stmt_info))
4125             {
4126               if (maybe_gather || maybe_scatter)
4127                 {
4128                   if (maybe_gather)
4129                     gatherscatter = GATHER;
4130                   else
4131                     gatherscatter = SCATTER;
4132                 }
4133             }
4134
4135           if (gatherscatter == SG_NONE)
4136             {
4137               if (dump_enabled_p ())
4138                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4139                                  "not vectorized: data ref analysis "
4140                                  "failed %G", stmt_info->stmt);
4141               if (is_a <bb_vec_info> (vinfo))
4142                 {
4143                   /* In BB vectorization the ref can still participate
4144                      in dependence analysis, we just can't vectorize it.  */
4145                   STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4146                   continue;
4147                 }
4148               return opt_result::failure_at (stmt_info->stmt,
4149                                              "not vectorized:"
4150                                              " data ref analysis failed: %G",
4151                                              stmt_info->stmt);
4152             }
4153         }
4154
4155       /* See if this was detected as SIMD lane access.  */
4156       if (dr->aux == (void *)-1)
4157         {
4158           if (nested_in_vect_loop_p (loop, stmt_info))
4159             return opt_result::failure_at (stmt_info->stmt,
4160                                            "not vectorized:"
4161                                            " data ref analysis failed: %G",
4162                                            stmt_info->stmt);
4163           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4164         }
4165
4166       tree base = get_base_address (DR_REF (dr));
4167       if (base && VAR_P (base) && DECL_NONALIASED (base))
4168         {
4169           if (dump_enabled_p ())
4170             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4171                              "not vectorized: base object not addressable "
4172                              "for stmt: %G", stmt_info->stmt);
4173           if (is_a <bb_vec_info> (vinfo))
4174             {
4175               /* In BB vectorization the ref can still participate
4176                  in dependence analysis, we just can't vectorize it.  */
4177               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4178               continue;
4179             }
4180           return opt_result::failure_at (stmt_info->stmt,
4181                                          "not vectorized: base object not"
4182                                          " addressable for stmt: %G",
4183                                          stmt_info->stmt);
4184         }
4185
4186       if (is_a <loop_vec_info> (vinfo)
4187           && DR_STEP (dr)
4188           && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4189         {
4190           if (nested_in_vect_loop_p (loop, stmt_info))
4191             return opt_result::failure_at (stmt_info->stmt,
4192                                            "not vectorized:"
4193                                            "not suitable for strided load %G",
4194                                            stmt_info->stmt);
4195           STMT_VINFO_STRIDED_P (stmt_info) = true;
4196         }
4197
4198       /* Update DR field in stmt_vec_info struct.  */
4199
4200       /* If the dataref is in an inner-loop of the loop that is considered for
4201          for vectorization, we also want to analyze the access relative to
4202          the outer-loop (DR contains information only relative to the
4203          inner-most enclosing loop).  We do that by building a reference to the
4204          first location accessed by the inner-loop, and analyze it relative to
4205          the outer-loop.  */
4206       if (loop && nested_in_vect_loop_p (loop, stmt_info))
4207         {
4208           /* Build a reference to the first location accessed by the
4209              inner loop: *(BASE + INIT + OFFSET).  By construction,
4210              this address must be invariant in the inner loop, so we
4211              can consider it as being used in the outer loop.  */
4212           tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4213           tree offset = unshare_expr (DR_OFFSET (dr));
4214           tree init = unshare_expr (DR_INIT (dr));
4215           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4216                                           init, offset);
4217           tree init_addr = fold_build_pointer_plus (base, init_offset);
4218           tree init_ref = build_fold_indirect_ref (init_addr);
4219
4220           if (dump_enabled_p ())
4221             dump_printf_loc (MSG_NOTE, vect_location,
4222                              "analyze in outer loop: %T\n", init_ref);
4223
4224           opt_result res
4225             = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4226                                     init_ref, loop, stmt_info->stmt);
4227           if (!res)
4228             /* dr_analyze_innermost already explained the failure.  */
4229             return res;
4230
4231           if (dump_enabled_p ())
4232             dump_printf_loc (MSG_NOTE, vect_location,
4233                              "\touter base_address: %T\n"
4234                              "\touter offset from base address: %T\n"
4235                              "\touter constant offset from base address: %T\n"
4236                              "\touter step: %T\n"
4237                              "\touter base alignment: %d\n\n"
4238                              "\touter base misalignment: %d\n"
4239                              "\touter offset alignment: %d\n"
4240                              "\touter step alignment: %d\n",
4241                              STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4242                              STMT_VINFO_DR_OFFSET (stmt_info),
4243                              STMT_VINFO_DR_INIT (stmt_info),
4244                              STMT_VINFO_DR_STEP (stmt_info),
4245                              STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4246                              STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4247                              STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4248                              STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4249         }
4250
4251       /* Set vectype for STMT.  */
4252       scalar_type = TREE_TYPE (DR_REF (dr));
4253       STMT_VINFO_VECTYPE (stmt_info)
4254         = get_vectype_for_scalar_type (scalar_type);
4255       if (!STMT_VINFO_VECTYPE (stmt_info))
4256         {
4257           if (dump_enabled_p ())
4258             {
4259               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4260                                "not vectorized: no vectype for stmt: %G",
4261                                stmt_info->stmt);
4262               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4263               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4264                                  scalar_type);
4265               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4266             }
4267
4268           if (is_a <bb_vec_info> (vinfo))
4269             {
4270               /* No vector type is fine, the ref can still participate
4271                  in dependence analysis, we just can't vectorize it.  */
4272               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4273               continue;
4274             }
4275           return opt_result::failure_at (stmt_info->stmt,
4276                                          "not vectorized:"
4277                                          " no vectype for stmt: %G"
4278                                          " scalar_type: %T\n",
4279                                          stmt_info->stmt, scalar_type);
4280         }
4281       else
4282         {
4283           if (dump_enabled_p ())
4284             dump_printf_loc (MSG_NOTE, vect_location,
4285                              "got vectype for stmt: %G%T\n",
4286                              stmt_info->stmt, STMT_VINFO_VECTYPE (stmt_info));
4287         }
4288
4289       /* Adjust the minimal vectorization factor according to the
4290          vector type.  */
4291       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4292       *min_vf = upper_bound (*min_vf, vf);
4293
4294       if (gatherscatter != SG_NONE)
4295         {
4296           gather_scatter_info gs_info;
4297           if (!vect_check_gather_scatter (stmt_info,
4298                                           as_a <loop_vec_info> (vinfo),
4299                                           &gs_info)
4300               || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4301             return opt_result::failure_at
4302               (stmt_info->stmt,
4303                (gatherscatter == GATHER) ?
4304                "not vectorized: not suitable for gather load %G" :
4305                "not vectorized: not suitable for scatter store %G",
4306                stmt_info->stmt);
4307           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4308         }
4309     }
4310
4311   /* We used to stop processing and prune the list here.  Verify we no
4312      longer need to.  */
4313   gcc_assert (i == datarefs.length ());
4314
4315   return opt_result::success ();
4316 }
4317
4318
4319 /* Function vect_get_new_vect_var.
4320
4321    Returns a name for a new variable.  The current naming scheme appends the
4322    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4323    the name of vectorizer generated variables, and appends that to NAME if
4324    provided.  */
4325
4326 tree
4327 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4328 {
4329   const char *prefix;
4330   tree new_vect_var;
4331
4332   switch (var_kind)
4333   {
4334   case vect_simple_var:
4335     prefix = "vect";
4336     break;
4337   case vect_scalar_var:
4338     prefix = "stmp";
4339     break;
4340   case vect_mask_var:
4341     prefix = "mask";
4342     break;
4343   case vect_pointer_var:
4344     prefix = "vectp";
4345     break;
4346   default:
4347     gcc_unreachable ();
4348   }
4349
4350   if (name)
4351     {
4352       char* tmp = concat (prefix, "_", name, NULL);
4353       new_vect_var = create_tmp_reg (type, tmp);
4354       free (tmp);
4355     }
4356   else
4357     new_vect_var = create_tmp_reg (type, prefix);
4358
4359   return new_vect_var;
4360 }
4361
4362 /* Like vect_get_new_vect_var but return an SSA name.  */
4363
4364 tree
4365 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4366 {
4367   const char *prefix;
4368   tree new_vect_var;
4369
4370   switch (var_kind)
4371   {
4372   case vect_simple_var:
4373     prefix = "vect";
4374     break;
4375   case vect_scalar_var:
4376     prefix = "stmp";
4377     break;
4378   case vect_pointer_var:
4379     prefix = "vectp";
4380     break;
4381   default:
4382     gcc_unreachable ();
4383   }
4384
4385   if (name)
4386     {
4387       char* tmp = concat (prefix, "_", name, NULL);
4388       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4389       free (tmp);
4390     }
4391   else
4392     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4393
4394   return new_vect_var;
4395 }
4396
4397 /* Duplicate ptr info and set alignment/misaligment on NAME from DR_INFO.  */
4398
4399 static void
4400 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4401 {
4402   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4403   int misalign = DR_MISALIGNMENT (dr_info);
4404   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4405     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4406   else
4407     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4408                             known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4409                             misalign);
4410 }
4411
4412 /* Function vect_create_addr_base_for_vector_ref.
4413
4414    Create an expression that computes the address of the first memory location
4415    that will be accessed for a data reference.
4416
4417    Input:
4418    STMT_INFO: The statement containing the data reference.
4419    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4420    OFFSET: Optional. If supplied, it is be added to the initial address.
4421    LOOP:    Specify relative to which loop-nest should the address be computed.
4422             For example, when the dataref is in an inner-loop nested in an
4423             outer-loop that is now being vectorized, LOOP can be either the
4424             outer-loop, or the inner-loop.  The first memory location accessed
4425             by the following dataref ('in' points to short):
4426
4427                 for (i=0; i<N; i++)
4428                    for (j=0; j<M; j++)
4429                      s += in[i+j]
4430
4431             is as follows:
4432             if LOOP=i_loop:     &in             (relative to i_loop)
4433             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4434    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4435             initial address.  Unlike OFFSET, which is number of elements to
4436             be added, BYTE_OFFSET is measured in bytes.
4437
4438    Output:
4439    1. Return an SSA_NAME whose value is the address of the memory location of
4440       the first vector of the data reference.
4441    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4442       these statement(s) which define the returned SSA_NAME.
4443
4444    FORNOW: We are only handling array accesses with step 1.  */
4445
4446 tree
4447 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4448                                       gimple_seq *new_stmt_list,
4449                                       tree offset,
4450                                       tree byte_offset)
4451 {
4452   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4453   struct data_reference *dr = dr_info->dr;
4454   const char *base_name;
4455   tree addr_base;
4456   tree dest;
4457   gimple_seq seq = NULL;
4458   tree vect_ptr_type;
4459   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4460   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4461   innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4462
4463   tree data_ref_base = unshare_expr (drb->base_address);
4464   tree base_offset = unshare_expr (drb->offset);
4465   tree init = unshare_expr (drb->init);
4466
4467   if (loop_vinfo)
4468     base_name = get_name (data_ref_base);
4469   else
4470     {
4471       base_offset = ssize_int (0);
4472       init = ssize_int (0);
4473       base_name = get_name (DR_REF (dr));
4474     }
4475
4476   /* Create base_offset */
4477   base_offset = size_binop (PLUS_EXPR,
4478                             fold_convert (sizetype, base_offset),
4479                             fold_convert (sizetype, init));
4480
4481   if (offset)
4482     {
4483       offset = fold_build2 (MULT_EXPR, sizetype,
4484                             fold_convert (sizetype, offset), step);
4485       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4486                                  base_offset, offset);
4487     }
4488   if (byte_offset)
4489     {
4490       byte_offset = fold_convert (sizetype, byte_offset);
4491       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4492                                  base_offset, byte_offset);
4493     }
4494
4495   /* base + base_offset */
4496   if (loop_vinfo)
4497     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4498   else
4499     {
4500       addr_base = build1 (ADDR_EXPR,
4501                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4502                           unshare_expr (DR_REF (dr)));
4503     }
4504
4505   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4506   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4507   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4508   gimple_seq_add_seq (new_stmt_list, seq);
4509
4510   if (DR_PTR_INFO (dr)
4511       && TREE_CODE (addr_base) == SSA_NAME
4512       && !SSA_NAME_PTR_INFO (addr_base))
4513     {
4514       vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4515       if (offset || byte_offset)
4516         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4517     }
4518
4519   if (dump_enabled_p ())
4520     dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4521
4522   return addr_base;
4523 }
4524
4525
4526 /* Function vect_create_data_ref_ptr.
4527
4528    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4529    location accessed in the loop by STMT_INFO, along with the def-use update
4530    chain to appropriately advance the pointer through the loop iterations.
4531    Also set aliasing information for the pointer.  This pointer is used by
4532    the callers to this function to create a memory reference expression for
4533    vector load/store access.
4534
4535    Input:
4536    1. STMT_INFO: a stmt that references memory. Expected to be of the form
4537          GIMPLE_ASSIGN <name, data-ref> or
4538          GIMPLE_ASSIGN <data-ref, name>.
4539    2. AGGR_TYPE: the type of the reference, which should be either a vector
4540         or an array.
4541    3. AT_LOOP: the loop where the vector memref is to be created.
4542    4. OFFSET (optional): an offset to be added to the initial address accessed
4543         by the data-ref in STMT_INFO.
4544    5. BSI: location where the new stmts are to be placed if there is no loop
4545    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4546         pointing to the initial address.
4547    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4548         to the initial address accessed by the data-ref in STMT_INFO.  This is
4549         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4550         in bytes.
4551    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4552         to the IV during each iteration of the loop.  NULL says to move
4553         by one copy of AGGR_TYPE up or down, depending on the step of the
4554         data reference.
4555
4556    Output:
4557    1. Declare a new ptr to vector_type, and have it point to the base of the
4558       data reference (initial addressed accessed by the data reference).
4559       For example, for vector of type V8HI, the following code is generated:
4560
4561       v8hi *ap;
4562       ap = (v8hi *)initial_address;
4563
4564       if OFFSET is not supplied:
4565          initial_address = &a[init];
4566       if OFFSET is supplied:
4567          initial_address = &a[init + OFFSET];
4568       if BYTE_OFFSET is supplied:
4569          initial_address = &a[init] + BYTE_OFFSET;
4570
4571       Return the initial_address in INITIAL_ADDRESS.
4572
4573    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4574       update the pointer in each iteration of the loop.
4575
4576       Return the increment stmt that updates the pointer in PTR_INCR.
4577
4578    3. Return the pointer.  */
4579
4580 tree
4581 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4582                           struct loop *at_loop, tree offset,
4583                           tree *initial_address, gimple_stmt_iterator *gsi,
4584                           gimple **ptr_incr, bool only_init,
4585                           tree byte_offset, tree iv_step)
4586 {
4587   const char *base_name;
4588   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4589   struct loop *loop = NULL;
4590   bool nested_in_vect_loop = false;
4591   struct loop *containing_loop = NULL;
4592   tree aggr_ptr_type;
4593   tree aggr_ptr;
4594   tree new_temp;
4595   gimple_seq new_stmt_list = NULL;
4596   edge pe = NULL;
4597   basic_block new_bb;
4598   tree aggr_ptr_init;
4599   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4600   struct data_reference *dr = dr_info->dr;
4601   tree aptr;
4602   gimple_stmt_iterator incr_gsi;
4603   bool insert_after;
4604   tree indx_before_incr, indx_after_incr;
4605   gimple *incr;
4606   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4607
4608   gcc_assert (iv_step != NULL_TREE
4609               || TREE_CODE (aggr_type) == ARRAY_TYPE
4610               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4611
4612   if (loop_vinfo)
4613     {
4614       loop = LOOP_VINFO_LOOP (loop_vinfo);
4615       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4616       containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4617       pe = loop_preheader_edge (loop);
4618     }
4619   else
4620     {
4621       gcc_assert (bb_vinfo);
4622       only_init = true;
4623       *ptr_incr = NULL;
4624     }
4625
4626   /* Create an expression for the first address accessed by this load
4627      in LOOP.  */
4628   base_name = get_name (DR_BASE_ADDRESS (dr));
4629
4630   if (dump_enabled_p ())
4631     {
4632       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4633       dump_printf_loc (MSG_NOTE, vect_location,
4634                        "create %s-pointer variable to type: %T",
4635                        get_tree_code_name (TREE_CODE (aggr_type)),
4636                        aggr_type);
4637       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4638         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4639       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4640         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4641       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4642         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4643       else
4644         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4645       dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4646     }
4647
4648   /* (1) Create the new aggregate-pointer variable.
4649      Vector and array types inherit the alias set of their component
4650      type by default so we need to use a ref-all pointer if the data
4651      reference does not conflict with the created aggregated data
4652      reference because it is not addressable.  */
4653   bool need_ref_all = false;
4654   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4655                               get_alias_set (DR_REF (dr))))
4656     need_ref_all = true;
4657   /* Likewise for any of the data references in the stmt group.  */
4658   else if (DR_GROUP_SIZE (stmt_info) > 1)
4659     {
4660       stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4661       do
4662         {
4663           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4664           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4665                                       get_alias_set (DR_REF (sdr))))
4666             {
4667               need_ref_all = true;
4668               break;
4669             }
4670           sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4671         }
4672       while (sinfo);
4673     }
4674   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4675                                                need_ref_all);
4676   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4677
4678
4679   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4680      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4681      def-use update cycles for the pointer: one relative to the outer-loop
4682      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4683      to the inner-loop (which is the inner-most loop containing the dataref),
4684      and this is done be step (5) below.
4685
4686      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4687      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4688      redundant.  Steps (3),(4) create the following:
4689
4690         vp0 = &base_addr;
4691         LOOP:   vp1 = phi(vp0,vp2)
4692                 ...
4693                 ...
4694                 vp2 = vp1 + step
4695                 goto LOOP
4696
4697      If there is an inner-loop nested in loop, then step (5) will also be
4698      applied, and an additional update in the inner-loop will be created:
4699
4700         vp0 = &base_addr;
4701         LOOP:   vp1 = phi(vp0,vp2)
4702                 ...
4703         inner:     vp3 = phi(vp1,vp4)
4704                    vp4 = vp3 + inner_step
4705                    if () goto inner
4706                 ...
4707                 vp2 = vp1 + step
4708                 if () goto LOOP   */
4709
4710   /* (2) Calculate the initial address of the aggregate-pointer, and set
4711      the aggregate-pointer to point to it before the loop.  */
4712
4713   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4714
4715   new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4716                                                    offset, byte_offset);
4717   if (new_stmt_list)
4718     {
4719       if (pe)
4720         {
4721           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4722           gcc_assert (!new_bb);
4723         }
4724       else
4725         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4726     }
4727
4728   *initial_address = new_temp;
4729   aggr_ptr_init = new_temp;
4730
4731   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4732      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4733      inner-loop nested in LOOP (during outer-loop vectorization).  */
4734
4735   /* No update in loop is required.  */
4736   if (only_init && (!loop_vinfo || at_loop == loop))
4737     aptr = aggr_ptr_init;
4738   else
4739     {
4740       /* Accesses to invariant addresses should be handled specially
4741          by the caller.  */
4742       tree step = vect_dr_behavior (dr_info)->step;
4743       gcc_assert (!integer_zerop (step));
4744
4745       if (iv_step == NULL_TREE)
4746         {
4747           /* The step of the aggregate pointer is the type size,
4748              negated for downward accesses.  */
4749           iv_step = TYPE_SIZE_UNIT (aggr_type);
4750           if (tree_int_cst_sgn (step) == -1)
4751             iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4752         }
4753
4754       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4755
4756       create_iv (aggr_ptr_init,
4757                  fold_convert (aggr_ptr_type, iv_step),
4758                  aggr_ptr, loop, &incr_gsi, insert_after,
4759                  &indx_before_incr, &indx_after_incr);
4760       incr = gsi_stmt (incr_gsi);
4761       loop_vinfo->add_stmt (incr);
4762
4763       /* Copy the points-to information if it exists. */
4764       if (DR_PTR_INFO (dr))
4765         {
4766           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4767           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4768         }
4769       if (ptr_incr)
4770         *ptr_incr = incr;
4771
4772       aptr = indx_before_incr;
4773     }
4774
4775   if (!nested_in_vect_loop || only_init)
4776     return aptr;
4777
4778
4779   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4780      nested in LOOP, if exists.  */
4781
4782   gcc_assert (nested_in_vect_loop);
4783   if (!only_init)
4784     {
4785       standard_iv_increment_position (containing_loop, &incr_gsi,
4786                                       &insert_after);
4787       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4788                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4789                  &indx_after_incr);
4790       incr = gsi_stmt (incr_gsi);
4791       loop_vinfo->add_stmt (incr);
4792
4793       /* Copy the points-to information if it exists. */
4794       if (DR_PTR_INFO (dr))
4795         {
4796           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4797           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4798         }
4799       if (ptr_incr)
4800         *ptr_incr = incr;
4801
4802       return indx_before_incr;
4803     }
4804   else
4805     gcc_unreachable ();
4806 }
4807
4808
4809 /* Function bump_vector_ptr
4810
4811    Increment a pointer (to a vector type) by vector-size. If requested,
4812    i.e. if PTR-INCR is given, then also connect the new increment stmt
4813    to the existing def-use update-chain of the pointer, by modifying
4814    the PTR_INCR as illustrated below:
4815
4816    The pointer def-use update-chain before this function:
4817                         DATAREF_PTR = phi (p_0, p_2)
4818                         ....
4819         PTR_INCR:       p_2 = DATAREF_PTR + step
4820
4821    The pointer def-use update-chain after this function:
4822                         DATAREF_PTR = phi (p_0, p_2)
4823                         ....
4824                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4825                         ....
4826         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4827
4828    Input:
4829    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4830                  in the loop.
4831    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4832               the loop.  The increment amount across iterations is expected
4833               to be vector_size.
4834    BSI - location where the new update stmt is to be placed.
4835    STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4836    BUMP - optional. The offset by which to bump the pointer. If not given,
4837           the offset is assumed to be vector_size.
4838
4839    Output: Return NEW_DATAREF_PTR as illustrated above.
4840
4841 */
4842
4843 tree
4844 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4845                  stmt_vec_info stmt_info, tree bump)
4846 {
4847   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4848   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4849   tree update = TYPE_SIZE_UNIT (vectype);
4850   gassign *incr_stmt;
4851   ssa_op_iter iter;
4852   use_operand_p use_p;
4853   tree new_dataref_ptr;
4854
4855   if (bump)
4856     update = bump;
4857
4858   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4859     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4860   else
4861     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4862   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4863                                    dataref_ptr, update);
4864   vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4865
4866   /* Copy the points-to information if it exists. */
4867   if (DR_PTR_INFO (dr))
4868     {
4869       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4870       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4871     }
4872
4873   if (!ptr_incr)
4874     return new_dataref_ptr;
4875
4876   /* Update the vector-pointer's cross-iteration increment.  */
4877   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4878     {
4879       tree use = USE_FROM_PTR (use_p);
4880
4881       if (use == dataref_ptr)
4882         SET_USE (use_p, new_dataref_ptr);
4883       else
4884         gcc_assert (operand_equal_p (use, update, 0));
4885     }
4886
4887   return new_dataref_ptr;
4888 }
4889
4890
4891 /* Copy memory reference info such as base/clique from the SRC reference
4892    to the DEST MEM_REF.  */
4893
4894 void
4895 vect_copy_ref_info (tree dest, tree src)
4896 {
4897   if (TREE_CODE (dest) != MEM_REF)
4898     return;
4899
4900   tree src_base = src;
4901   while (handled_component_p (src_base))
4902     src_base = TREE_OPERAND (src_base, 0);
4903   if (TREE_CODE (src_base) != MEM_REF
4904       && TREE_CODE (src_base) != TARGET_MEM_REF)
4905     return;
4906
4907   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
4908   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
4909 }
4910
4911
4912 /* Function vect_create_destination_var.
4913
4914    Create a new temporary of type VECTYPE.  */
4915
4916 tree
4917 vect_create_destination_var (tree scalar_dest, tree vectype)
4918 {
4919   tree vec_dest;
4920   const char *name;
4921   char *new_name;
4922   tree type;
4923   enum vect_var_kind kind;
4924
4925   kind = vectype
4926     ? VECTOR_BOOLEAN_TYPE_P (vectype)
4927     ? vect_mask_var
4928     : vect_simple_var
4929     : vect_scalar_var;
4930   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4931
4932   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4933
4934   name = get_name (scalar_dest);
4935   if (name)
4936     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4937   else
4938     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4939   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4940   free (new_name);
4941
4942   return vec_dest;
4943 }
4944
4945 /* Function vect_grouped_store_supported.
4946
4947    Returns TRUE if interleave high and interleave low permutations
4948    are supported, and FALSE otherwise.  */
4949
4950 bool
4951 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4952 {
4953   machine_mode mode = TYPE_MODE (vectype);
4954
4955   /* vect_permute_store_chain requires the group size to be equal to 3 or
4956      be a power of two.  */
4957   if (count != 3 && exact_log2 (count) == -1)
4958     {
4959       if (dump_enabled_p ())
4960         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4961                          "the size of the group of accesses"
4962                          " is not a power of 2 or not eqaul to 3\n");
4963       return false;
4964     }
4965
4966   /* Check that the permutation is supported.  */
4967   if (VECTOR_MODE_P (mode))
4968     {
4969       unsigned int i;
4970       if (count == 3)
4971         {
4972           unsigned int j0 = 0, j1 = 0, j2 = 0;
4973           unsigned int i, j;
4974
4975           unsigned int nelt;
4976           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4977             {
4978               if (dump_enabled_p ())
4979                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4980                                  "cannot handle groups of 3 stores for"
4981                                  " variable-length vectors\n");
4982               return false;
4983             }
4984
4985           vec_perm_builder sel (nelt, nelt, 1);
4986           sel.quick_grow (nelt);
4987           vec_perm_indices indices;
4988           for (j = 0; j < 3; j++)
4989             {
4990               int nelt0 = ((3 - j) * nelt) % 3;
4991               int nelt1 = ((3 - j) * nelt + 1) % 3;
4992               int nelt2 = ((3 - j) * nelt + 2) % 3;
4993               for (i = 0; i < nelt; i++)
4994                 {
4995                   if (3 * i + nelt0 < nelt)
4996                     sel[3 * i + nelt0] = j0++;
4997                   if (3 * i + nelt1 < nelt)
4998                     sel[3 * i + nelt1] = nelt + j1++;
4999                   if (3 * i + nelt2 < nelt)
5000                     sel[3 * i + nelt2] = 0;
5001                 }
5002               indices.new_vector (sel, 2, nelt);
5003               if (!can_vec_perm_const_p (mode, indices))
5004                 {
5005                   if (dump_enabled_p ())
5006                     dump_printf (MSG_MISSED_OPTIMIZATION,
5007                                  "permutation op not supported by target.\n");
5008                   return false;
5009                 }
5010
5011               for (i = 0; i < nelt; i++)
5012                 {
5013                   if (3 * i + nelt0 < nelt)
5014                     sel[3 * i + nelt0] = 3 * i + nelt0;
5015                   if (3 * i + nelt1 < nelt)
5016                     sel[3 * i + nelt1] = 3 * i + nelt1;
5017                   if (3 * i + nelt2 < nelt)
5018                     sel[3 * i + nelt2] = nelt + j2++;
5019                 }
5020               indices.new_vector (sel, 2, nelt);
5021               if (!can_vec_perm_const_p (mode, indices))
5022                 {
5023                   if (dump_enabled_p ())
5024                     dump_printf (MSG_MISSED_OPTIMIZATION,
5025                                  "permutation op not supported by target.\n");
5026                   return false;
5027                 }
5028             }
5029           return true;
5030         }
5031       else
5032         {
5033           /* If length is not equal to 3 then only power of 2 is supported.  */
5034           gcc_assert (pow2p_hwi (count));
5035           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5036
5037           /* The encoding has 2 interleaved stepped patterns.  */
5038           vec_perm_builder sel (nelt, 2, 3);
5039           sel.quick_grow (6);
5040           for (i = 0; i < 3; i++)
5041             {
5042               sel[i * 2] = i;
5043               sel[i * 2 + 1] = i + nelt;
5044             }
5045           vec_perm_indices indices (sel, 2, nelt);
5046           if (can_vec_perm_const_p (mode, indices))
5047             {
5048               for (i = 0; i < 6; i++)
5049                 sel[i] += exact_div (nelt, 2);
5050               indices.new_vector (sel, 2, nelt);
5051               if (can_vec_perm_const_p (mode, indices))
5052                 return true;
5053             }
5054         }
5055     }
5056
5057   if (dump_enabled_p ())
5058     dump_printf (MSG_MISSED_OPTIMIZATION,
5059                  "permutation op not supported by target.\n");
5060   return false;
5061 }
5062
5063
5064 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5065    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5066
5067 bool
5068 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5069                             bool masked_p)
5070 {
5071   if (masked_p)
5072     return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5073                                          vec_mask_store_lanes_optab,
5074                                          vectype, count);
5075   else
5076     return vect_lanes_optab_supported_p ("vec_store_lanes",
5077                                          vec_store_lanes_optab,
5078                                          vectype, count);
5079 }
5080
5081
5082 /* Function vect_permute_store_chain.
5083
5084    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5085    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5086    the data correctly for the stores.  Return the final references for stores
5087    in RESULT_CHAIN.
5088
5089    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5090    The input is 4 vectors each containing 8 elements.  We assign a number to
5091    each element, the input sequence is:
5092
5093    1st vec:   0  1  2  3  4  5  6  7
5094    2nd vec:   8  9 10 11 12 13 14 15
5095    3rd vec:  16 17 18 19 20 21 22 23
5096    4th vec:  24 25 26 27 28 29 30 31
5097
5098    The output sequence should be:
5099
5100    1st vec:  0  8 16 24  1  9 17 25
5101    2nd vec:  2 10 18 26  3 11 19 27
5102    3rd vec:  4 12 20 28  5 13 21 30
5103    4th vec:  6 14 22 30  7 15 23 31
5104
5105    i.e., we interleave the contents of the four vectors in their order.
5106
5107    We use interleave_high/low instructions to create such output.  The input of
5108    each interleave_high/low operation is two vectors:
5109    1st vec    2nd vec
5110    0 1 2 3    4 5 6 7
5111    the even elements of the result vector are obtained left-to-right from the
5112    high/low elements of the first vector.  The odd elements of the result are
5113    obtained left-to-right from the high/low elements of the second vector.
5114    The output of interleave_high will be:   0 4 1 5
5115    and of interleave_low:                   2 6 3 7
5116
5117
5118    The permutation is done in log LENGTH stages.  In each stage interleave_high
5119    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5120    where the first argument is taken from the first half of DR_CHAIN and the
5121    second argument from it's second half.
5122    In our example,
5123
5124    I1: interleave_high (1st vec, 3rd vec)
5125    I2: interleave_low (1st vec, 3rd vec)
5126    I3: interleave_high (2nd vec, 4th vec)
5127    I4: interleave_low (2nd vec, 4th vec)
5128
5129    The output for the first stage is:
5130
5131    I1:  0 16  1 17  2 18  3 19
5132    I2:  4 20  5 21  6 22  7 23
5133    I3:  8 24  9 25 10 26 11 27
5134    I4: 12 28 13 29 14 30 15 31
5135
5136    The output of the second stage, i.e. the final result is:
5137
5138    I1:  0  8 16 24  1  9 17 25
5139    I2:  2 10 18 26  3 11 19 27
5140    I3:  4 12 20 28  5 13 21 30
5141    I4:  6 14 22 30  7 15 23 31.  */
5142
5143 void
5144 vect_permute_store_chain (vec<tree> dr_chain,
5145                           unsigned int length,
5146                           stmt_vec_info stmt_info,
5147                           gimple_stmt_iterator *gsi,
5148                           vec<tree> *result_chain)
5149 {
5150   tree vect1, vect2, high, low;
5151   gimple *perm_stmt;
5152   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5153   tree perm_mask_low, perm_mask_high;
5154   tree data_ref;
5155   tree perm3_mask_low, perm3_mask_high;
5156   unsigned int i, j, n, log_length = exact_log2 (length);
5157
5158   result_chain->quick_grow (length);
5159   memcpy (result_chain->address (), dr_chain.address (),
5160           length * sizeof (tree));
5161
5162   if (length == 3)
5163     {
5164       /* vect_grouped_store_supported ensures that this is constant.  */
5165       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5166       unsigned int j0 = 0, j1 = 0, j2 = 0;
5167
5168       vec_perm_builder sel (nelt, nelt, 1);
5169       sel.quick_grow (nelt);
5170       vec_perm_indices indices;
5171       for (j = 0; j < 3; j++)
5172         {
5173           int nelt0 = ((3 - j) * nelt) % 3;
5174           int nelt1 = ((3 - j) * nelt + 1) % 3;
5175           int nelt2 = ((3 - j) * nelt + 2) % 3;
5176
5177           for (i = 0; i < nelt; i++)
5178             {
5179               if (3 * i + nelt0 < nelt)
5180                 sel[3 * i + nelt0] = j0++;
5181               if (3 * i + nelt1 < nelt)
5182                 sel[3 * i + nelt1] = nelt + j1++;
5183               if (3 * i + nelt2 < nelt)
5184                 sel[3 * i + nelt2] = 0;
5185             }
5186           indices.new_vector (sel, 2, nelt);
5187           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5188
5189           for (i = 0; i < nelt; i++)
5190             {
5191               if (3 * i + nelt0 < nelt)
5192                 sel[3 * i + nelt0] = 3 * i + nelt0;
5193               if (3 * i + nelt1 < nelt)
5194                 sel[3 * i + nelt1] = 3 * i + nelt1;
5195               if (3 * i + nelt2 < nelt)
5196                 sel[3 * i + nelt2] = nelt + j2++;
5197             }
5198           indices.new_vector (sel, 2, nelt);
5199           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5200
5201           vect1 = dr_chain[0];
5202           vect2 = dr_chain[1];
5203
5204           /* Create interleaving stmt:
5205              low = VEC_PERM_EXPR <vect1, vect2,
5206                                   {j, nelt, *, j + 1, nelt + j + 1, *,
5207                                    j + 2, nelt + j + 2, *, ...}>  */
5208           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5209           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5210                                            vect2, perm3_mask_low);
5211           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5212
5213           vect1 = data_ref;
5214           vect2 = dr_chain[2];
5215           /* Create interleaving stmt:
5216              low = VEC_PERM_EXPR <vect1, vect2,
5217                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
5218                                    6, 7, nelt + j + 2, ...}>  */
5219           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5220           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5221                                            vect2, perm3_mask_high);
5222           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5223           (*result_chain)[j] = data_ref;
5224         }
5225     }
5226   else
5227     {
5228       /* If length is not equal to 3 then only power of 2 is supported.  */
5229       gcc_assert (pow2p_hwi (length));
5230
5231       /* The encoding has 2 interleaved stepped patterns.  */
5232       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5233       vec_perm_builder sel (nelt, 2, 3);
5234       sel.quick_grow (6);
5235       for (i = 0; i < 3; i++)
5236         {
5237           sel[i * 2] = i;
5238           sel[i * 2 + 1] = i + nelt;
5239         }
5240         vec_perm_indices indices (sel, 2, nelt);
5241         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5242
5243         for (i = 0; i < 6; i++)
5244           sel[i] += exact_div (nelt, 2);
5245         indices.new_vector (sel, 2, nelt);
5246         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5247
5248         for (i = 0, n = log_length; i < n; i++)
5249           {
5250             for (j = 0; j < length/2; j++)
5251               {
5252                 vect1 = dr_chain[j];
5253                 vect2 = dr_chain[j+length/2];
5254
5255                 /* Create interleaving stmt:
5256                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5257                                                         ...}>  */
5258                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5259                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5260                                                  vect2, perm_mask_high);
5261                 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5262                 (*result_chain)[2*j] = high;
5263
5264                 /* Create interleaving stmt:
5265                    low = VEC_PERM_EXPR <vect1, vect2,
5266                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5267                                          ...}>  */
5268                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5269                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5270                                                  vect2, perm_mask_low);
5271                 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5272                 (*result_chain)[2*j+1] = low;
5273               }
5274             memcpy (dr_chain.address (), result_chain->address (),
5275                     length * sizeof (tree));
5276           }
5277     }
5278 }
5279
5280 /* Function vect_setup_realignment
5281
5282    This function is called when vectorizing an unaligned load using
5283    the dr_explicit_realign[_optimized] scheme.
5284    This function generates the following code at the loop prolog:
5285
5286       p = initial_addr;
5287    x  msq_init = *(floor(p));   # prolog load
5288       realignment_token = call target_builtin;
5289     loop:
5290    x  msq = phi (msq_init, ---)
5291
5292    The stmts marked with x are generated only for the case of
5293    dr_explicit_realign_optimized.
5294
5295    The code above sets up a new (vector) pointer, pointing to the first
5296    location accessed by STMT_INFO, and a "floor-aligned" load using that
5297    pointer.  It also generates code to compute the "realignment-token"
5298    (if the relevant target hook was defined), and creates a phi-node at the
5299    loop-header bb whose arguments are the result of the prolog-load (created
5300    by this function) and the result of a load that takes place in the loop
5301    (to be created by the caller to this function).
5302
5303    For the case of dr_explicit_realign_optimized:
5304    The caller to this function uses the phi-result (msq) to create the
5305    realignment code inside the loop, and sets up the missing phi argument,
5306    as follows:
5307     loop:
5308       msq = phi (msq_init, lsq)
5309       lsq = *(floor(p'));        # load in loop
5310       result = realign_load (msq, lsq, realignment_token);
5311
5312    For the case of dr_explicit_realign:
5313     loop:
5314       msq = *(floor(p));        # load in loop
5315       p' = p + (VS-1);
5316       lsq = *(floor(p'));       # load in loop
5317       result = realign_load (msq, lsq, realignment_token);
5318
5319    Input:
5320    STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5321                a memory location that may be unaligned.
5322    BSI - place where new code is to be inserted.
5323    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5324                               is used.
5325
5326    Output:
5327    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5328                        target hook, if defined.
5329    Return value - the result of the loop-header phi node.  */
5330
5331 tree
5332 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5333                         tree *realignment_token,
5334                         enum dr_alignment_support alignment_support_scheme,
5335                         tree init_addr,
5336                         struct loop **at_loop)
5337 {
5338   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5339   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5340   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5341   struct data_reference *dr = dr_info->dr;
5342   struct loop *loop = NULL;
5343   edge pe = NULL;
5344   tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5345   tree vec_dest;
5346   gimple *inc;
5347   tree ptr;
5348   tree data_ref;
5349   basic_block new_bb;
5350   tree msq_init = NULL_TREE;
5351   tree new_temp;
5352   gphi *phi_stmt;
5353   tree msq = NULL_TREE;
5354   gimple_seq stmts = NULL;
5355   bool compute_in_loop = false;
5356   bool nested_in_vect_loop = false;
5357   struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5358   struct loop *loop_for_initial_load = NULL;
5359
5360   if (loop_vinfo)
5361     {
5362       loop = LOOP_VINFO_LOOP (loop_vinfo);
5363       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5364     }
5365
5366   gcc_assert (alignment_support_scheme == dr_explicit_realign
5367               || alignment_support_scheme == dr_explicit_realign_optimized);
5368
5369   /* We need to generate three things:
5370      1. the misalignment computation
5371      2. the extra vector load (for the optimized realignment scheme).
5372      3. the phi node for the two vectors from which the realignment is
5373       done (for the optimized realignment scheme).  */
5374
5375   /* 1. Determine where to generate the misalignment computation.
5376
5377      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5378      calculation will be generated by this function, outside the loop (in the
5379      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5380      caller, inside the loop.
5381
5382      Background: If the misalignment remains fixed throughout the iterations of
5383      the loop, then both realignment schemes are applicable, and also the
5384      misalignment computation can be done outside LOOP.  This is because we are
5385      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5386      are a multiple of VS (the Vector Size), and therefore the misalignment in
5387      different vectorized LOOP iterations is always the same.
5388      The problem arises only if the memory access is in an inner-loop nested
5389      inside LOOP, which is now being vectorized using outer-loop vectorization.
5390      This is the only case when the misalignment of the memory access may not
5391      remain fixed throughout the iterations of the inner-loop (as explained in
5392      detail in vect_supportable_dr_alignment).  In this case, not only is the
5393      optimized realignment scheme not applicable, but also the misalignment
5394      computation (and generation of the realignment token that is passed to
5395      REALIGN_LOAD) have to be done inside the loop.
5396
5397      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5398      or not, which in turn determines if the misalignment is computed inside
5399      the inner-loop, or outside LOOP.  */
5400
5401   if (init_addr != NULL_TREE || !loop_vinfo)
5402     {
5403       compute_in_loop = true;
5404       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5405     }
5406
5407
5408   /* 2. Determine where to generate the extra vector load.
5409
5410      For the optimized realignment scheme, instead of generating two vector
5411      loads in each iteration, we generate a single extra vector load in the
5412      preheader of the loop, and in each iteration reuse the result of the
5413      vector load from the previous iteration.  In case the memory access is in
5414      an inner-loop nested inside LOOP, which is now being vectorized using
5415      outer-loop vectorization, we need to determine whether this initial vector
5416      load should be generated at the preheader of the inner-loop, or can be
5417      generated at the preheader of LOOP.  If the memory access has no evolution
5418      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5419      to be generated inside LOOP (in the preheader of the inner-loop).  */
5420
5421   if (nested_in_vect_loop)
5422     {
5423       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5424       bool invariant_in_outerloop =
5425             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5426       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5427     }
5428   else
5429     loop_for_initial_load = loop;
5430   if (at_loop)
5431     *at_loop = loop_for_initial_load;
5432
5433   if (loop_for_initial_load)
5434     pe = loop_preheader_edge (loop_for_initial_load);
5435
5436   /* 3. For the case of the optimized realignment, create the first vector
5437       load at the loop preheader.  */
5438
5439   if (alignment_support_scheme == dr_explicit_realign_optimized)
5440     {
5441       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5442       gassign *new_stmt;
5443
5444       gcc_assert (!compute_in_loop);
5445       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5446       ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5447                                       loop_for_initial_load, NULL_TREE,
5448                                       &init_addr, NULL, &inc, true);
5449       if (TREE_CODE (ptr) == SSA_NAME)
5450         new_temp = copy_ssa_name (ptr);
5451       else
5452         new_temp = make_ssa_name (TREE_TYPE (ptr));
5453       poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5454       tree type = TREE_TYPE (ptr);
5455       new_stmt = gimple_build_assign
5456                    (new_temp, BIT_AND_EXPR, ptr,
5457                     fold_build2 (MINUS_EXPR, type,
5458                                  build_int_cst (type, 0),
5459                                  build_int_cst (type, align)));
5460       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5461       gcc_assert (!new_bb);
5462       data_ref
5463         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5464                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5465       vect_copy_ref_info (data_ref, DR_REF (dr));
5466       new_stmt = gimple_build_assign (vec_dest, data_ref);
5467       new_temp = make_ssa_name (vec_dest, new_stmt);
5468       gimple_assign_set_lhs (new_stmt, new_temp);
5469       if (pe)
5470         {
5471           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5472           gcc_assert (!new_bb);
5473         }
5474       else
5475          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5476
5477       msq_init = gimple_assign_lhs (new_stmt);
5478     }
5479
5480   /* 4. Create realignment token using a target builtin, if available.
5481       It is done either inside the containing loop, or before LOOP (as
5482       determined above).  */
5483
5484   if (targetm.vectorize.builtin_mask_for_load)
5485     {
5486       gcall *new_stmt;
5487       tree builtin_decl;
5488
5489       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
5490       if (!init_addr)
5491         {
5492           /* Generate the INIT_ADDR computation outside LOOP.  */
5493           init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5494                                                             NULL_TREE);
5495           if (loop)
5496             {
5497               pe = loop_preheader_edge (loop);
5498               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5499               gcc_assert (!new_bb);
5500             }
5501           else
5502              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5503         }
5504
5505       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5506       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5507       vec_dest =
5508         vect_create_destination_var (scalar_dest,
5509                                      gimple_call_return_type (new_stmt));
5510       new_temp = make_ssa_name (vec_dest, new_stmt);
5511       gimple_call_set_lhs (new_stmt, new_temp);
5512
5513       if (compute_in_loop)
5514         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5515       else
5516         {
5517           /* Generate the misalignment computation outside LOOP.  */
5518           pe = loop_preheader_edge (loop);
5519           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5520           gcc_assert (!new_bb);
5521         }
5522
5523       *realignment_token = gimple_call_lhs (new_stmt);
5524
5525       /* The result of the CALL_EXPR to this builtin is determined from
5526          the value of the parameter and no global variables are touched
5527          which makes the builtin a "const" function.  Requiring the
5528          builtin to have the "const" attribute makes it unnecessary
5529          to call mark_call_clobbered.  */
5530       gcc_assert (TREE_READONLY (builtin_decl));
5531     }
5532
5533   if (alignment_support_scheme == dr_explicit_realign)
5534     return msq;
5535
5536   gcc_assert (!compute_in_loop);
5537   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5538
5539
5540   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5541
5542   pe = loop_preheader_edge (containing_loop);
5543   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5544   msq = make_ssa_name (vec_dest);
5545   phi_stmt = create_phi_node (msq, containing_loop->header);
5546   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5547
5548   return msq;
5549 }
5550
5551
5552 /* Function vect_grouped_load_supported.
5553
5554    COUNT is the size of the load group (the number of statements plus the
5555    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5556    only one statement, with a gap of COUNT - 1.
5557
5558    Returns true if a suitable permute exists.  */
5559
5560 bool
5561 vect_grouped_load_supported (tree vectype, bool single_element_p,
5562                              unsigned HOST_WIDE_INT count)
5563 {
5564   machine_mode mode = TYPE_MODE (vectype);
5565
5566   /* If this is single-element interleaving with an element distance
5567      that leaves unused vector loads around punt - we at least create
5568      very sub-optimal code in that case (and blow up memory,
5569      see PR65518).  */
5570   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5571     {
5572       if (dump_enabled_p ())
5573         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5574                          "single-element interleaving not supported "
5575                          "for not adjacent vector loads\n");
5576       return false;
5577     }
5578
5579   /* vect_permute_load_chain requires the group size to be equal to 3 or
5580      be a power of two.  */
5581   if (count != 3 && exact_log2 (count) == -1)
5582     {
5583       if (dump_enabled_p ())
5584         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5585                          "the size of the group of accesses"
5586                          " is not a power of 2 or not equal to 3\n");
5587       return false;
5588     }
5589
5590   /* Check that the permutation is supported.  */
5591   if (VECTOR_MODE_P (mode))
5592     {
5593       unsigned int i, j;
5594       if (count == 3)
5595         {
5596           unsigned int nelt;
5597           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5598             {
5599               if (dump_enabled_p ())
5600                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5601                                  "cannot handle groups of 3 loads for"
5602                                  " variable-length vectors\n");
5603               return false;
5604             }
5605
5606           vec_perm_builder sel (nelt, nelt, 1);
5607           sel.quick_grow (nelt);
5608           vec_perm_indices indices;
5609           unsigned int k;
5610           for (k = 0; k < 3; k++)
5611             {
5612               for (i = 0; i < nelt; i++)
5613                 if (3 * i + k < 2 * nelt)
5614                   sel[i] = 3 * i + k;
5615                 else
5616                   sel[i] = 0;
5617               indices.new_vector (sel, 2, nelt);
5618               if (!can_vec_perm_const_p (mode, indices))
5619                 {
5620                   if (dump_enabled_p ())
5621                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5622                                      "shuffle of 3 loads is not supported by"
5623                                      " target\n");
5624                   return false;
5625                 }
5626               for (i = 0, j = 0; i < nelt; i++)
5627                 if (3 * i + k < 2 * nelt)
5628                   sel[i] = i;
5629                 else
5630                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5631               indices.new_vector (sel, 2, nelt);
5632               if (!can_vec_perm_const_p (mode, indices))
5633                 {
5634                   if (dump_enabled_p ())
5635                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5636                                      "shuffle of 3 loads is not supported by"
5637                                      " target\n");
5638                   return false;
5639                 }
5640             }
5641           return true;
5642         }
5643       else
5644         {
5645           /* If length is not equal to 3 then only power of 2 is supported.  */
5646           gcc_assert (pow2p_hwi (count));
5647           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5648
5649           /* The encoding has a single stepped pattern.  */
5650           vec_perm_builder sel (nelt, 1, 3);
5651           sel.quick_grow (3);
5652           for (i = 0; i < 3; i++)
5653             sel[i] = i * 2;
5654           vec_perm_indices indices (sel, 2, nelt);
5655           if (can_vec_perm_const_p (mode, indices))
5656             {
5657               for (i = 0; i < 3; i++)
5658                 sel[i] = i * 2 + 1;
5659               indices.new_vector (sel, 2, nelt);
5660               if (can_vec_perm_const_p (mode, indices))
5661                 return true;
5662             }
5663         }
5664     }
5665
5666   if (dump_enabled_p ())
5667     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5668                      "extract even/odd not supported by target\n");
5669   return false;
5670 }
5671
5672 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5673    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5674
5675 bool
5676 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5677                            bool masked_p)
5678 {
5679   if (masked_p)
5680     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5681                                          vec_mask_load_lanes_optab,
5682                                          vectype, count);
5683   else
5684     return vect_lanes_optab_supported_p ("vec_load_lanes",
5685                                          vec_load_lanes_optab,
5686                                          vectype, count);
5687 }
5688
5689 /* Function vect_permute_load_chain.
5690
5691    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5692    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5693    the input data correctly.  Return the final references for loads in
5694    RESULT_CHAIN.
5695
5696    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5697    The input is 4 vectors each containing 8 elements. We assign a number to each
5698    element, the input sequence is:
5699
5700    1st vec:   0  1  2  3  4  5  6  7
5701    2nd vec:   8  9 10 11 12 13 14 15
5702    3rd vec:  16 17 18 19 20 21 22 23
5703    4th vec:  24 25 26 27 28 29 30 31
5704
5705    The output sequence should be:
5706
5707    1st vec:  0 4  8 12 16 20 24 28
5708    2nd vec:  1 5  9 13 17 21 25 29
5709    3rd vec:  2 6 10 14 18 22 26 30
5710    4th vec:  3 7 11 15 19 23 27 31
5711
5712    i.e., the first output vector should contain the first elements of each
5713    interleaving group, etc.
5714
5715    We use extract_even/odd instructions to create such output.  The input of
5716    each extract_even/odd operation is two vectors
5717    1st vec    2nd vec
5718    0 1 2 3    4 5 6 7
5719
5720    and the output is the vector of extracted even/odd elements.  The output of
5721    extract_even will be:   0 2 4 6
5722    and of extract_odd:     1 3 5 7
5723
5724
5725    The permutation is done in log LENGTH stages.  In each stage extract_even
5726    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5727    their order.  In our example,
5728
5729    E1: extract_even (1st vec, 2nd vec)
5730    E2: extract_odd (1st vec, 2nd vec)
5731    E3: extract_even (3rd vec, 4th vec)
5732    E4: extract_odd (3rd vec, 4th vec)
5733
5734    The output for the first stage will be:
5735
5736    E1:  0  2  4  6  8 10 12 14
5737    E2:  1  3  5  7  9 11 13 15
5738    E3: 16 18 20 22 24 26 28 30
5739    E4: 17 19 21 23 25 27 29 31
5740
5741    In order to proceed and create the correct sequence for the next stage (or
5742    for the correct output, if the second stage is the last one, as in our
5743    example), we first put the output of extract_even operation and then the
5744    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5745    The input for the second stage is:
5746
5747    1st vec (E1):  0  2  4  6  8 10 12 14
5748    2nd vec (E3): 16 18 20 22 24 26 28 30
5749    3rd vec (E2):  1  3  5  7  9 11 13 15
5750    4th vec (E4): 17 19 21 23 25 27 29 31
5751
5752    The output of the second stage:
5753
5754    E1: 0 4  8 12 16 20 24 28
5755    E2: 2 6 10 14 18 22 26 30
5756    E3: 1 5  9 13 17 21 25 29
5757    E4: 3 7 11 15 19 23 27 31
5758
5759    And RESULT_CHAIN after reordering:
5760
5761    1st vec (E1):  0 4  8 12 16 20 24 28
5762    2nd vec (E3):  1 5  9 13 17 21 25 29
5763    3rd vec (E2):  2 6 10 14 18 22 26 30
5764    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5765
5766 static void
5767 vect_permute_load_chain (vec<tree> dr_chain,
5768                          unsigned int length,
5769                          stmt_vec_info stmt_info,
5770                          gimple_stmt_iterator *gsi,
5771                          vec<tree> *result_chain)
5772 {
5773   tree data_ref, first_vect, second_vect;
5774   tree perm_mask_even, perm_mask_odd;
5775   tree perm3_mask_low, perm3_mask_high;
5776   gimple *perm_stmt;
5777   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5778   unsigned int i, j, log_length = exact_log2 (length);
5779
5780   result_chain->quick_grow (length);
5781   memcpy (result_chain->address (), dr_chain.address (),
5782           length * sizeof (tree));
5783
5784   if (length == 3)
5785     {
5786       /* vect_grouped_load_supported ensures that this is constant.  */
5787       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5788       unsigned int k;
5789
5790       vec_perm_builder sel (nelt, nelt, 1);
5791       sel.quick_grow (nelt);
5792       vec_perm_indices indices;
5793       for (k = 0; k < 3; k++)
5794         {
5795           for (i = 0; i < nelt; i++)
5796             if (3 * i + k < 2 * nelt)
5797               sel[i] = 3 * i + k;
5798             else
5799               sel[i] = 0;
5800           indices.new_vector (sel, 2, nelt);
5801           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5802
5803           for (i = 0, j = 0; i < nelt; i++)
5804             if (3 * i + k < 2 * nelt)
5805               sel[i] = i;
5806             else
5807               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5808           indices.new_vector (sel, 2, nelt);
5809           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5810
5811           first_vect = dr_chain[0];
5812           second_vect = dr_chain[1];
5813
5814           /* Create interleaving stmt (low part of):
5815              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5816                                                              ...}>  */
5817           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5818           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5819                                            second_vect, perm3_mask_low);
5820           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5821
5822           /* Create interleaving stmt (high part of):
5823              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5824                                                               ...}>  */
5825           first_vect = data_ref;
5826           second_vect = dr_chain[2];
5827           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5828           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5829                                            second_vect, perm3_mask_high);
5830           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5831           (*result_chain)[k] = data_ref;
5832         }
5833     }
5834   else
5835     {
5836       /* If length is not equal to 3 then only power of 2 is supported.  */
5837       gcc_assert (pow2p_hwi (length));
5838
5839       /* The encoding has a single stepped pattern.  */
5840       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5841       vec_perm_builder sel (nelt, 1, 3);
5842       sel.quick_grow (3);
5843       for (i = 0; i < 3; ++i)
5844         sel[i] = i * 2;
5845       vec_perm_indices indices (sel, 2, nelt);
5846       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5847
5848       for (i = 0; i < 3; ++i)
5849         sel[i] = i * 2 + 1;
5850       indices.new_vector (sel, 2, nelt);
5851       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5852
5853       for (i = 0; i < log_length; i++)
5854         {
5855           for (j = 0; j < length; j += 2)
5856             {
5857               first_vect = dr_chain[j];
5858               second_vect = dr_chain[j+1];
5859
5860               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5861               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5862               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5863                                                first_vect, second_vect,
5864                                                perm_mask_even);
5865               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5866               (*result_chain)[j/2] = data_ref;
5867
5868               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5869               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5870               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5871                                                first_vect, second_vect,
5872                                                perm_mask_odd);
5873               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5874               (*result_chain)[j/2+length/2] = data_ref;
5875             }
5876           memcpy (dr_chain.address (), result_chain->address (),
5877                   length * sizeof (tree));
5878         }
5879     }
5880 }
5881
5882 /* Function vect_shift_permute_load_chain.
5883
5884    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5885    sequence of stmts to reorder the input data accordingly.
5886    Return the final references for loads in RESULT_CHAIN.
5887    Return true if successed, false otherwise.
5888
5889    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5890    The input is 3 vectors each containing 8 elements.  We assign a
5891    number to each element, the input sequence is:
5892
5893    1st vec:   0  1  2  3  4  5  6  7
5894    2nd vec:   8  9 10 11 12 13 14 15
5895    3rd vec:  16 17 18 19 20 21 22 23
5896
5897    The output sequence should be:
5898
5899    1st vec:  0 3 6  9 12 15 18 21
5900    2nd vec:  1 4 7 10 13 16 19 22
5901    3rd vec:  2 5 8 11 14 17 20 23
5902
5903    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5904
5905    First we shuffle all 3 vectors to get correct elements order:
5906
5907    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5908    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5909    3rd vec:  (16 19 22) (17 20 23) (18 21)
5910
5911    Next we unite and shift vector 3 times:
5912
5913    1st step:
5914      shift right by 6 the concatenation of:
5915      "1st vec" and  "2nd vec"
5916        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5917      "2nd vec" and  "3rd vec"
5918        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5919      "3rd vec" and  "1st vec"
5920        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5921                              | New vectors                   |
5922
5923      So that now new vectors are:
5924
5925      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5926      2nd vec:  (10 13) (16 19 22) (17 20 23)
5927      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5928
5929    2nd step:
5930      shift right by 5 the concatenation of:
5931      "1st vec" and  "3rd vec"
5932        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5933      "2nd vec" and  "1st vec"
5934        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5935      "3rd vec" and  "2nd vec"
5936        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5937                           | New vectors                   |
5938
5939      So that now new vectors are:
5940
5941      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5942      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5943      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5944
5945    3rd step:
5946      shift right by 5 the concatenation of:
5947      "1st vec" and  "1st vec"
5948        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5949      shift right by 3 the concatenation of:
5950      "2nd vec" and  "2nd vec"
5951                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5952                           | New vectors                   |
5953
5954      So that now all vectors are READY:
5955      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5956      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5957      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5958
5959    This algorithm is faster than one in vect_permute_load_chain if:
5960      1.  "shift of a concatination" is faster than general permutation.
5961          This is usually so.
5962      2.  The TARGET machine can't execute vector instructions in parallel.
5963          This is because each step of the algorithm depends on previous.
5964          The algorithm in vect_permute_load_chain is much more parallel.
5965
5966    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5967 */
5968
5969 static bool
5970 vect_shift_permute_load_chain (vec<tree> dr_chain,
5971                                unsigned int length,
5972                                stmt_vec_info stmt_info,
5973                                gimple_stmt_iterator *gsi,
5974                                vec<tree> *result_chain)
5975 {
5976   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5977   tree perm2_mask1, perm2_mask2, perm3_mask;
5978   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5979   gimple *perm_stmt;
5980
5981   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5982   unsigned int i;
5983   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5984
5985   unsigned HOST_WIDE_INT nelt, vf;
5986   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
5987       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
5988     /* Not supported for variable-length vectors.  */
5989     return false;
5990
5991   vec_perm_builder sel (nelt, nelt, 1);
5992   sel.quick_grow (nelt);
5993
5994   result_chain->quick_grow (length);
5995   memcpy (result_chain->address (), dr_chain.address (),
5996           length * sizeof (tree));
5997
5998   if (pow2p_hwi (length) && vf > 4)
5999     {
6000       unsigned int j, log_length = exact_log2 (length);
6001       for (i = 0; i < nelt / 2; ++i)
6002         sel[i] = i * 2;
6003       for (i = 0; i < nelt / 2; ++i)
6004         sel[nelt / 2 + i] = i * 2 + 1;
6005       vec_perm_indices indices (sel, 2, nelt);
6006       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6007         {
6008           if (dump_enabled_p ())
6009             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6010                              "shuffle of 2 fields structure is not \
6011                               supported by target\n");
6012           return false;
6013         }
6014       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6015
6016       for (i = 0; i < nelt / 2; ++i)
6017         sel[i] = i * 2 + 1;
6018       for (i = 0; i < nelt / 2; ++i)
6019         sel[nelt / 2 + i] = i * 2;
6020       indices.new_vector (sel, 2, nelt);
6021       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6022         {
6023           if (dump_enabled_p ())
6024             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6025                              "shuffle of 2 fields structure is not \
6026                               supported by target\n");
6027           return false;
6028         }
6029       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6030
6031       /* Generating permutation constant to shift all elements.
6032          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6033       for (i = 0; i < nelt; i++)
6034         sel[i] = nelt / 2 + i;
6035       indices.new_vector (sel, 2, nelt);
6036       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6037         {
6038           if (dump_enabled_p ())
6039             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6040                              "shift permutation is not supported by target\n");
6041           return false;
6042         }
6043       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6044
6045       /* Generating permutation constant to select vector from 2.
6046          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6047       for (i = 0; i < nelt / 2; i++)
6048         sel[i] = i;
6049       for (i = nelt / 2; i < nelt; i++)
6050         sel[i] = nelt + i;
6051       indices.new_vector (sel, 2, nelt);
6052       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6053         {
6054           if (dump_enabled_p ())
6055             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6056                              "select is not supported by target\n");
6057           return false;
6058         }
6059       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6060
6061       for (i = 0; i < log_length; i++)
6062         {
6063           for (j = 0; j < length; j += 2)
6064             {
6065               first_vect = dr_chain[j];
6066               second_vect = dr_chain[j + 1];
6067
6068               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6069               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6070                                                first_vect, first_vect,
6071                                                perm2_mask1);
6072               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6073               vect[0] = data_ref;
6074
6075               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6076               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6077                                                second_vect, second_vect,
6078                                                perm2_mask2);
6079               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6080               vect[1] = data_ref;
6081
6082               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6083               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6084                                                vect[0], vect[1], shift1_mask);
6085               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6086               (*result_chain)[j/2 + length/2] = data_ref;
6087
6088               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6089               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6090                                                vect[0], vect[1], select_mask);
6091               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6092               (*result_chain)[j/2] = data_ref;
6093             }
6094           memcpy (dr_chain.address (), result_chain->address (),
6095                   length * sizeof (tree));
6096         }
6097       return true;
6098     }
6099   if (length == 3 && vf > 2)
6100     {
6101       unsigned int k = 0, l = 0;
6102
6103       /* Generating permutation constant to get all elements in rigth order.
6104          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6105       for (i = 0; i < nelt; i++)
6106         {
6107           if (3 * k + (l % 3) >= nelt)
6108             {
6109               k = 0;
6110               l += (3 - (nelt % 3));
6111             }
6112           sel[i] = 3 * k + (l % 3);
6113           k++;
6114         }
6115       vec_perm_indices indices (sel, 2, nelt);
6116       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6117         {
6118           if (dump_enabled_p ())
6119             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6120                              "shuffle of 3 fields structure is not \
6121                               supported by target\n");
6122           return false;
6123         }
6124       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6125
6126       /* Generating permutation constant to shift all elements.
6127          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6128       for (i = 0; i < nelt; i++)
6129         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6130       indices.new_vector (sel, 2, nelt);
6131       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6132         {
6133           if (dump_enabled_p ())
6134             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6135                              "shift permutation is not supported by target\n");
6136           return false;
6137         }
6138       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6139
6140       /* Generating permutation constant to shift all elements.
6141          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6142       for (i = 0; i < nelt; i++)
6143         sel[i] = 2 * (nelt / 3) + 1 + i;
6144       indices.new_vector (sel, 2, nelt);
6145       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6146         {
6147           if (dump_enabled_p ())
6148             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6149                              "shift permutation is not supported by target\n");
6150           return false;
6151         }
6152       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6153
6154       /* Generating permutation constant to shift all elements.
6155          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6156       for (i = 0; i < nelt; i++)
6157         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6158       indices.new_vector (sel, 2, nelt);
6159       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6160         {
6161           if (dump_enabled_p ())
6162             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6163                              "shift permutation is not supported by target\n");
6164           return false;
6165         }
6166       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6167
6168       /* Generating permutation constant to shift all elements.
6169          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6170       for (i = 0; i < nelt; i++)
6171         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6172       indices.new_vector (sel, 2, nelt);
6173       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6174         {
6175           if (dump_enabled_p ())
6176             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6177                              "shift permutation is not supported by target\n");
6178           return false;
6179         }
6180       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6181
6182       for (k = 0; k < 3; k++)
6183         {
6184           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6185           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6186                                            dr_chain[k], dr_chain[k],
6187                                            perm3_mask);
6188           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6189           vect[k] = data_ref;
6190         }
6191
6192       for (k = 0; k < 3; k++)
6193         {
6194           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6195           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6196                                            vect[k % 3], vect[(k + 1) % 3],
6197                                            shift1_mask);
6198           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6199           vect_shift[k] = data_ref;
6200         }
6201
6202       for (k = 0; k < 3; k++)
6203         {
6204           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6205           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6206                                            vect_shift[(4 - k) % 3],
6207                                            vect_shift[(3 - k) % 3],
6208                                            shift2_mask);
6209           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6210           vect[k] = data_ref;
6211         }
6212
6213       (*result_chain)[3 - (nelt % 3)] = vect[2];
6214
6215       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6216       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6217                                        vect[0], shift3_mask);
6218       vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6219       (*result_chain)[nelt % 3] = data_ref;
6220
6221       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6222       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6223                                        vect[1], shift4_mask);
6224       vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6225       (*result_chain)[0] = data_ref;
6226       return true;
6227     }
6228   return false;
6229 }
6230
6231 /* Function vect_transform_grouped_load.
6232
6233    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6234    to perform their permutation and ascribe the result vectorized statements to
6235    the scalar statements.
6236 */
6237
6238 void
6239 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6240                              int size, gimple_stmt_iterator *gsi)
6241 {
6242   machine_mode mode;
6243   vec<tree> result_chain = vNULL;
6244
6245   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6246      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6247      vectors, that are ready for vector computation.  */
6248   result_chain.create (size);
6249
6250   /* If reassociation width for vector type is 2 or greater target machine can
6251      execute 2 or more vector instructions in parallel.  Otherwise try to
6252      get chain for loads group using vect_shift_permute_load_chain.  */
6253   mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6254   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6255       || pow2p_hwi (size)
6256       || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6257                                          gsi, &result_chain))
6258     vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6259   vect_record_grouped_load_vectors (stmt_info, result_chain);
6260   result_chain.release ();
6261 }
6262
6263 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6264    generated as part of the vectorization of STMT_INFO.  Assign the statement
6265    for each vector to the associated scalar statement.  */
6266
6267 void
6268 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6269                                   vec<tree> result_chain)
6270 {
6271   vec_info *vinfo = stmt_info->vinfo;
6272   stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6273   unsigned int i, gap_count;
6274   tree tmp_data_ref;
6275
6276   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6277      Since we scan the chain starting from it's first node, their order
6278      corresponds the order of data-refs in RESULT_CHAIN.  */
6279   stmt_vec_info next_stmt_info = first_stmt_info;
6280   gap_count = 1;
6281   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6282     {
6283       if (!next_stmt_info)
6284         break;
6285
6286       /* Skip the gaps.  Loads created for the gaps will be removed by dead
6287        code elimination pass later.  No need to check for the first stmt in
6288        the group, since it always exists.
6289        DR_GROUP_GAP is the number of steps in elements from the previous
6290        access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
6291        correspond to the gaps.  */
6292       if (next_stmt_info != first_stmt_info
6293           && gap_count < DR_GROUP_GAP (next_stmt_info))
6294       {
6295         gap_count++;
6296         continue;
6297       }
6298
6299       while (next_stmt_info)
6300         {
6301           stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6302           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6303              copies, and we put the new vector statement in the first available
6304              RELATED_STMT.  */
6305           if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6306             STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6307           else
6308             {
6309               if (!DR_GROUP_SAME_DR_STMT (next_stmt_info))
6310                 {
6311                   stmt_vec_info prev_stmt_info
6312                     = STMT_VINFO_VEC_STMT (next_stmt_info);
6313                   stmt_vec_info rel_stmt_info
6314                     = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6315                   while (rel_stmt_info)
6316                     {
6317                       prev_stmt_info = rel_stmt_info;
6318                       rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6319                     }
6320
6321                   STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6322                 }
6323             }
6324
6325           next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6326           gap_count = 1;
6327           /* If NEXT_STMT_INFO accesses the same DR as the previous statement,
6328              put the same TMP_DATA_REF as its vectorized statement; otherwise
6329              get the next data-ref from RESULT_CHAIN.  */
6330           if (!next_stmt_info || !DR_GROUP_SAME_DR_STMT (next_stmt_info))
6331             break;
6332         }
6333     }
6334 }
6335
6336 /* Function vect_force_dr_alignment_p.
6337
6338    Returns whether the alignment of a DECL can be forced to be aligned
6339    on ALIGNMENT bit boundary.  */
6340
6341 bool
6342 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6343 {
6344   if (!VAR_P (decl))
6345     return false;
6346
6347   if (decl_in_symtab_p (decl)
6348       && !symtab_node::get (decl)->can_increase_alignment_p ())
6349     return false;
6350
6351   if (TREE_STATIC (decl))
6352     return (known_le (alignment,
6353                       (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6354   else
6355     return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6356 }
6357
6358
6359 /* Return whether the data reference DR_INFO is supported with respect to its
6360    alignment.
6361    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6362    it is aligned, i.e., check if it is possible to vectorize it with different
6363    alignment.  */
6364
6365 enum dr_alignment_support
6366 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6367                                bool check_aligned_accesses)
6368 {
6369   data_reference *dr = dr_info->dr;
6370   stmt_vec_info stmt_info = dr_info->stmt;
6371   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6372   machine_mode mode = TYPE_MODE (vectype);
6373   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6374   struct loop *vect_loop = NULL;
6375   bool nested_in_vect_loop = false;
6376
6377   if (aligned_access_p (dr_info) && !check_aligned_accesses)
6378     return dr_aligned;
6379
6380   /* For now assume all conditional loads/stores support unaligned
6381      access without any special code.  */
6382   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6383     if (gimple_call_internal_p (stmt)
6384         && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6385             || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6386       return dr_unaligned_supported;
6387
6388   if (loop_vinfo)
6389     {
6390       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6391       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6392     }
6393
6394   /* Possibly unaligned access.  */
6395
6396   /* We can choose between using the implicit realignment scheme (generating
6397      a misaligned_move stmt) and the explicit realignment scheme (generating
6398      aligned loads with a REALIGN_LOAD).  There are two variants to the
6399      explicit realignment scheme: optimized, and unoptimized.
6400      We can optimize the realignment only if the step between consecutive
6401      vector loads is equal to the vector size.  Since the vector memory
6402      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6403      is guaranteed that the misalignment amount remains the same throughout the
6404      execution of the vectorized loop.  Therefore, we can create the
6405      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6406      at the loop preheader.
6407
6408      However, in the case of outer-loop vectorization, when vectorizing a
6409      memory access in the inner-loop nested within the LOOP that is now being
6410      vectorized, while it is guaranteed that the misalignment of the
6411      vectorized memory access will remain the same in different outer-loop
6412      iterations, it is *not* guaranteed that is will remain the same throughout
6413      the execution of the inner-loop.  This is because the inner-loop advances
6414      with the original scalar step (and not in steps of VS).  If the inner-loop
6415      step happens to be a multiple of VS, then the misalignment remains fixed
6416      and we can use the optimized realignment scheme.  For example:
6417
6418       for (i=0; i<N; i++)
6419         for (j=0; j<M; j++)
6420           s += a[i+j];
6421
6422      When vectorizing the i-loop in the above example, the step between
6423      consecutive vector loads is 1, and so the misalignment does not remain
6424      fixed across the execution of the inner-loop, and the realignment cannot
6425      be optimized (as illustrated in the following pseudo vectorized loop):
6426
6427       for (i=0; i<N; i+=4)
6428         for (j=0; j<M; j++){
6429           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6430                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6431                          // (assuming that we start from an aligned address).
6432           }
6433
6434      We therefore have to use the unoptimized realignment scheme:
6435
6436       for (i=0; i<N; i+=4)
6437           for (j=k; j<M; j+=4)
6438           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6439                            // that the misalignment of the initial address is
6440                            // 0).
6441
6442      The loop can then be vectorized as follows:
6443
6444       for (k=0; k<4; k++){
6445         rt = get_realignment_token (&vp[k]);
6446         for (i=0; i<N; i+=4){
6447           v1 = vp[i+k];
6448           for (j=k; j<M; j+=4){
6449             v2 = vp[i+j+VS-1];
6450             va = REALIGN_LOAD <v1,v2,rt>;
6451             vs += va;
6452             v1 = v2;
6453           }
6454         }
6455     } */
6456
6457   if (DR_IS_READ (dr))
6458     {
6459       bool is_packed = false;
6460       tree type = (TREE_TYPE (DR_REF (dr)));
6461
6462       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6463           && (!targetm.vectorize.builtin_mask_for_load
6464               || targetm.vectorize.builtin_mask_for_load ()))
6465         {
6466           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6467
6468           /* If we are doing SLP then the accesses need not have the
6469              same alignment, instead it depends on the SLP group size.  */
6470           if (loop_vinfo
6471               && STMT_SLP_TYPE (stmt_info)
6472               && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6473                               * (DR_GROUP_SIZE
6474                                  (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6475                               TYPE_VECTOR_SUBPARTS (vectype)))
6476             ;
6477           else if (!loop_vinfo
6478                    || (nested_in_vect_loop
6479                        && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6480                                     GET_MODE_SIZE (TYPE_MODE (vectype)))))
6481             return dr_explicit_realign;
6482           else
6483             return dr_explicit_realign_optimized;
6484         }
6485       if (!known_alignment_for_access_p (dr_info))
6486         is_packed = not_size_aligned (DR_REF (dr));
6487
6488       if (targetm.vectorize.support_vector_misalignment
6489             (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6490         /* Can't software pipeline the loads, but can at least do them.  */
6491         return dr_unaligned_supported;
6492     }
6493   else
6494     {
6495       bool is_packed = false;
6496       tree type = (TREE_TYPE (DR_REF (dr)));
6497
6498       if (!known_alignment_for_access_p (dr_info))
6499         is_packed = not_size_aligned (DR_REF (dr));
6500
6501      if (targetm.vectorize.support_vector_misalignment
6502            (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6503        return dr_unaligned_supported;
6504     }
6505
6506   /* Unsupported.  */
6507   return dr_unaligned_unsupported;
6508 }