1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
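/* Illustrative sketch (added for exposition, not from the GCC sources):
   with 128-bit vectors, VECTYPE of mode V4SI and COUNT == 2, this asks
   whether the target can load two vectors as one interleaved group, as
   AArch64's LD2 instruction does:

       int32x4x2_t v = vld2q_s32 (p);   // lane 0: p[0], p[2], p[4], p[6]
                                        // lane 1: p[1], p[3], p[5], p[7]

   If no suitable array mode (or integer mode of the right size) exists,
   or the convert optab has no handler for it, the group is instead
   vectorized with separate loads plus permutes.  */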
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
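/* Illustrative example (added for exposition, not from the GCC sources):
   in

       short *s;  int *d;
       for (i = 0; i < n; i++)
         d[i] = (int) s[i];

   the conversion statement has an 'int' result but a 'short' rhs; it is
   the smaller 'short' type that has to determine the vectorization
   factor, which is why the rhs of promotions is inspected here.  */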
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
120 HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt_info->stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
134 if (assign
135 && (gimple_assign_cast_p (assign)
136 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
140 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
142 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
144 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
145 if (rhs < lhs)
146 scalar_type = rhs_type;
149 *lhs_size_unit = lhs;
150 *rhs_size_unit = rhs;
151 return scalar_type;
155 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
156 tested at run-time. Return TRUE if DDR was successfully inserted.
157 Return false if versioning is not supported. */
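/* Illustrative sketch (added for exposition, not from the GCC sources):
   for

       for (i = 0; i < n; i++)
         a[i] = b[i] + 1;

   with A and B possibly overlapping, recording the DDR here leads to a
   versioned loop guarded by a check of roughly the form

       if (a + n <= b || b + n <= a)
         ... vectorized loop ...
       else
         ... scalar loop ...

   With --param vect-max-version-for-alias-checks=0 the check is refused
   and the loop is typically left unvectorized when the dependence stays
   unknown.  */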
159 static opt_result
160 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
162 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
164 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
165 return opt_result::failure_at (vect_location,
166 "will not create alias checks, as"
167 " --param vect-max-version-for-alias-checks"
168 " == 0\n");
170 opt_result res
171 = runtime_alias_check_p (ddr, loop,
172 optimize_loop_nest_for_speed_p (loop));
173 if (!res)
174 return res;
176 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
177 return opt_result::success ();
180 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
182 static void
183 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
185 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
186 for (unsigned int i = 0; i < checks.length(); ++i)
187 if (checks[i] == value)
188 return;
190 if (dump_enabled_p ())
191 dump_printf_loc (MSG_NOTE, vect_location,
192 "need run-time check that %T is nonzero\n",
193 value);
194 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
197 /* Return true if we know that the order of vectorized DR_INFO_A and
198 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
199 DR_INFO_B. At least one of the accesses is a write. */
201 static bool
202 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
204 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
205 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
207 /* Single statements are always kept in their original order. */
208 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
209 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
210 return true;
212 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
213 group are emitted at the position of the last scalar load and all
214 stores in a group are emitted at the position of the last scalar store.
215 Compute that position and check whether the resulting order matches
216 the current one. */
217 stmt_vec_info last_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
218 if (last_a)
219 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
220 s = DR_GROUP_NEXT_ELEMENT (s))
221 last_a = get_later_stmt (last_a, s);
222 else
223 last_a = stmtinfo_a;
224 stmt_vec_info last_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
225 if (last_b)
226 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
227 s = DR_GROUP_NEXT_ELEMENT (s))
228 last_b = get_later_stmt (last_b, s);
229 else
230 last_b = stmtinfo_b;
231 return ((get_later_stmt (last_a, last_b) == last_a)
232 == (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a));
235 /* A subroutine of vect_analyze_data_ref_dependence. Handle
236 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
237 distances. These distances are conservatively correct but they don't
238 reflect a guaranteed dependence.
240 Return true if this function does all the work necessary to avoid
241 an alias or false if the caller should use the dependence distances
242 to limit the vectorization factor in the usual way. LOOP_DEPTH is
243 the depth of the loop described by LOOP_VINFO and the other arguments
244 are as for vect_analyze_data_ref_dependence. */
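/* Illustrative sketch (added for exposition, not from the GCC sources):
   such a DDR can arise when two accesses use the same access function on
   bases that may or may not be distinct, e.g.

       for (i = 0; i < n; i++)
         a->x[i + 2] = b->x[i] + 1;

   If A == B the distance is a known 2; if not, the accesses are
   independent.  Rather than unconditionally capping *MAX_VF at 2, this
   function prefers to add a runtime alias check and keep the full
   vectorization factor.  */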
246 static bool
247 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
248 loop_vec_info loop_vinfo,
249 int loop_depth, unsigned int *max_vf)
251 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
252 lambda_vector dist_v;
253 unsigned int i;
254 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
256 int dist = dist_v[loop_depth];
257 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
259 /* If the user asserted safelen >= DIST consecutive iterations
260 can be executed concurrently, assume independence.
262 ??? An alternative would be to add the alias check even
263 in this case, and vectorize the fallback loop with the
264 maximum VF set to safelen. However, if the user has
265 explicitly given a length, it's less likely that that
266 would be a win. */
267 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
269 if ((unsigned int) loop->safelen < *max_vf)
270 *max_vf = loop->safelen;
271 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
272 continue;
275 /* For dependence distances of 2 or more, we have the option
276 of limiting VF or checking for an alias at runtime.
277 Prefer to check at runtime if we can, to avoid limiting
278 the VF unnecessarily when the bases are in fact independent.
280 Note that the alias checks will be removed if the VF ends up
281 being small enough. */
282 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
283 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
284 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
285 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
286 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
289 return true;
293 /* Function vect_analyze_data_ref_dependence.
295 FIXME: I needed to change the sense of the returned flag.
297 Return FALSE if there (might) exist a dependence between a memory-reference
298 DRA and a memory-reference DRB. When versioning for alias may check a
299 dependence at run-time, return TRUE. Adjust *MAX_VF according to
300 the data dependence. */
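/* Illustrative example (added for exposition, not from the GCC sources):
   for

       for (i = 0; i < n; i++)
         a[i + 3] = a[i] + 1;

   the dependence distance is 3, so at most 3 iterations can be executed
   as one vector iteration and *MAX_VF is lowered to 3.  A distance of 0
   (the same element read and written in one iteration) is harmless as
   long as the scalar order of the two accesses is preserved.  */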
302 static opt_result
303 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
304 loop_vec_info loop_vinfo,
305 unsigned int *max_vf)
307 unsigned int i;
308 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
309 struct data_reference *dra = DDR_A (ddr);
310 struct data_reference *drb = DDR_B (ddr);
311 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
312 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
313 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
314 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
315 lambda_vector dist_v;
316 unsigned int loop_depth;
318 /* In loop analysis all data references should be vectorizable. */
319 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
320 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
321 gcc_unreachable ();
323 /* Independent data accesses. */
324 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
325 return opt_result::success ();
327 if (dra == drb
328 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
329 return opt_result::success ();
331 /* We do not have to consider dependences between accesses that belong
332 to the same group, unless the stride could be smaller than the
333 group size. */
334 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
335 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
336 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
337 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
338 return opt_result::success ();
340 /* Even if we have an anti-dependence then, as the vectorized loop covers at
341 least two scalar iterations, there is always also a true dependence.
342 As the vectorizer does not re-order loads and stores we can ignore
343 the anti-dependence if TBAA can disambiguate both DRs similar to the
344 case with known negative distance anti-dependences (positive
345 distance anti-dependences would violate TBAA constraints). */
346 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
347 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
348 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
349 get_alias_set (DR_REF (drb))))
350 return opt_result::success ();
352 /* Unknown data dependence. */
353 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
355 /* If user asserted safelen consecutive iterations can be
356 executed concurrently, assume independence. */
357 if (loop->safelen >= 2)
359 if ((unsigned int) loop->safelen < *max_vf)
360 *max_vf = loop->safelen;
361 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
362 return opt_result::success ();
365 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
366 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
367 return opt_result::failure_at
368 (stmtinfo_a->stmt,
369 "versioning for alias not supported for: "
370 "can't determine dependence between %T and %T\n",
371 DR_REF (dra), DR_REF (drb));
373 if (dump_enabled_p ())
374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
375 "versioning for alias required: "
376 "can't determine dependence between %T and %T\n",
377 DR_REF (dra), DR_REF (drb));
379 /* Add to list of ddrs that need to be tested at run-time. */
380 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
383 /* Known data dependence. */
384 if (DDR_NUM_DIST_VECTS (ddr) == 0)
386 /* If user asserted safelen consecutive iterations can be
387 executed concurrently, assume independence. */
388 if (loop->safelen >= 2)
390 if ((unsigned int) loop->safelen < *max_vf)
391 *max_vf = loop->safelen;
392 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
393 return opt_result::success ();
396 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
397 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
398 return opt_result::failure_at
399 (stmtinfo_a->stmt,
400 "versioning for alias not supported for: "
401 "bad dist vector for %T and %T\n",
402 DR_REF (dra), DR_REF (drb));
404 if (dump_enabled_p ())
405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
406 "versioning for alias required: "
407 "bad dist vector for %T and %T\n",
408 DR_REF (dra), DR_REF (drb));
409 /* Add to list of ddrs that need to be tested at run-time. */
410 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
413 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
415 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
416 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
417 loop_depth, max_vf))
418 return opt_result::success ();
420 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
422 int dist = dist_v[loop_depth];
424 if (dump_enabled_p ())
425 dump_printf_loc (MSG_NOTE, vect_location,
426 "dependence distance = %d.\n", dist);
428 if (dist == 0)
430 if (dump_enabled_p ())
431 dump_printf_loc (MSG_NOTE, vect_location,
432 "dependence distance == 0 between %T and %T\n",
433 DR_REF (dra), DR_REF (drb));
435 /* When we perform grouped accesses and perform implicit CSE
436 by detecting equal accesses and doing disambiguation with
437 runtime alias tests like for
438 .. = a[i];
439 .. = a[i+1];
440 a[i] = ..;
441 a[i+1] = ..;
442 *p = ..;
443 .. = a[i];
444 .. = a[i+1];
445 where we will end up loading { a[i], a[i+1] } once, make
446 sure that inserting group loads before the first load and
447 stores after the last store will do the right thing.
448 Similar for groups like
449 a[i] = ...;
450 ... = a[i];
451 a[i+1] = ...;
452 where loads from the group interleave with the store. */
453 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
454 return opt_result::failure_at (stmtinfo_a->stmt,
455 "READ_WRITE dependence"
456 " in interleaving.\n");
458 if (loop->safelen < 2)
460 tree indicator = dr_zero_step_indicator (dra);
461 if (!indicator || integer_zerop (indicator))
462 return opt_result::failure_at (stmtinfo_a->stmt,
463 "access also has a zero step\n");
464 else if (TREE_CODE (indicator) != INTEGER_CST)
465 vect_check_nonzero_value (loop_vinfo, indicator);
467 continue;
470 if (dist > 0 && DDR_REVERSED_P (ddr))
472 /* If DDR_REVERSED_P the order of the data-refs in DDR was
473 reversed (to make distance vector positive), and the actual
474 distance is negative. */
475 if (dump_enabled_p ())
476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
477 "dependence distance negative.\n");
478 /* Record a negative dependence distance to later limit the
479 amount of stmt copying / unrolling we can perform.
480 Only need to handle read-after-write dependence. */
481 if (DR_IS_READ (drb)
482 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
483 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
484 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
485 continue;
488 unsigned int abs_dist = abs (dist);
489 if (abs_dist >= 2 && abs_dist < *max_vf)
491 /* The dependence distance requires reduction of the maximal
492 vectorization factor. */
493 *max_vf = abs (dist);
494 if (dump_enabled_p ())
495 dump_printf_loc (MSG_NOTE, vect_location,
496 "adjusting maximal vectorization factor to %i\n",
497 *max_vf);
500 if (abs_dist >= *max_vf)
502 /* Dependence distance does not create dependence, as far as
503 vectorization is concerned, in this case. */
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location,
506 "dependence distance >= VF.\n");
507 continue;
510 return opt_result::failure_at (stmtinfo_a->stmt,
511 "not vectorized, possible dependence "
512 "between data-refs %T and %T\n",
513 DR_REF (dra), DR_REF (drb));
516 return opt_result::success ();
519 /* Function vect_analyze_data_ref_dependences.
521 Examine all the data references in the loop, and make sure there do not
522 exist any data dependences between them. Set *MAX_VF according to
523 the maximum vectorization factor the data dependences allow. */
525 opt_result
526 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
527 unsigned int *max_vf)
529 unsigned int i;
530 struct data_dependence_relation *ddr;
532 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
534 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
536 LOOP_VINFO_DDRS (loop_vinfo)
537 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
538 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
539 /* We need read-read dependences to compute
540 STMT_VINFO_SAME_ALIGN_REFS. */
541 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
542 &LOOP_VINFO_DDRS (loop_vinfo),
543 LOOP_VINFO_LOOP_NEST (loop_vinfo),
544 true);
545 gcc_assert (res);
548 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
550 /* For epilogues we either have no aliases or alias versioning
551 was applied to original loop. Therefore we may just get max_vf
552 using VF of original loop. */
553 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
554 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
555 else
556 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
558 opt_result res
559 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
560 if (!res)
561 return res;
564 return opt_result::success ();
568 /* Function vect_slp_analyze_data_ref_dependence.
570 Return TRUE if there (might) exist a dependence between a memory-reference
571 DRA and a memory-reference DRB for VINFO. When versioning for alias
572 may check a dependence at run-time, return FALSE. */
575 static bool
576 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
577 struct data_dependence_relation *ddr)
579 struct data_reference *dra = DDR_A (ddr);
580 struct data_reference *drb = DDR_B (ddr);
581 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
582 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
584 /* We need to check dependences of statements marked as unvectorizable
585 as well, they still can prohibit vectorization. */
587 /* Independent data accesses. */
588 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
589 return false;
591 if (dra == drb)
592 return false;
594 /* Read-read is OK. */
595 if (DR_IS_READ (dra) && DR_IS_READ (drb))
596 return false;
598 /* If dra and drb are part of the same interleaving chain consider
599 them independent. */
600 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
601 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
602 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
603 return false;
605 /* Unknown data dependence. */
606 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
608 if (dump_enabled_p ())
609 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
610 "can't determine dependence between %T and %T\n",
611 DR_REF (dra), DR_REF (drb));
613 else if (dump_enabled_p ())
614 dump_printf_loc (MSG_NOTE, vect_location,
615 "determined dependence between %T and %T\n",
616 DR_REF (dra), DR_REF (drb));
618 return true;
622 /* Analyze dependences involved in the transform of SLP NODE. STORES
623 contain the vector of scalar stores of this instance if we are
624 disambiguating the loads. */
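/* Illustrative sketch (added for exposition, not from the GCC sources):
   for an SLP store group

       a[0] = x;
       *p = 1;        <-- intervening store
       a[1] = y;

   the vectorized store is emitted at the position of the last scalar
   store (a[1] = y), so a[0] = x is effectively sunk past *p = 1.  The
   walk below checks that the intervening statement cannot clobber (or,
   for loads, use) the group's memory, falling back to the alias oracle
   when that statement has no recorded data reference.  */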
626 static bool
627 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
628 vec<stmt_vec_info> stores,
629 stmt_vec_info last_store_info)
631 /* This walks over all stmts involved in the SLP load/store done
632 in NODE verifying we can sink them up to the last stmt in the
633 group. */
634 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
635 vec_info *vinfo = last_access_info->vinfo;
636 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
638 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
639 if (access_info == last_access_info)
640 continue;
641 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
642 ao_ref ref;
643 bool ref_initialized_p = false;
644 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
645 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
647 gimple *stmt = gsi_stmt (gsi);
648 if (! gimple_vuse (stmt)
649 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
650 continue;
652 /* If we couldn't record a (single) data reference for this
653 stmt we have to resort to the alias oracle. */
654 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
655 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
656 if (!dr_b)
658 /* We are moving a store or sinking a load - this means
659 we cannot use TBAA for disambiguation. */
660 if (!ref_initialized_p)
661 ao_ref_init (&ref, DR_REF (dr_a));
662 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
663 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
664 return false;
665 continue;
668 bool dependent = false;
669 /* If we run into a store of this same instance (we've just
670 marked those) then delay dependence checking until we run
671 into the last store because this is where it will have
672 been sunk to (and we verify if we can do that as well). */
673 if (gimple_visited_p (stmt))
675 if (stmt_info != last_store_info)
676 continue;
677 unsigned i;
678 stmt_vec_info store_info;
679 FOR_EACH_VEC_ELT (stores, i, store_info)
681 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
682 ddr_p ddr = initialize_data_dependence_relation
683 (dr_a, store_dr, vNULL);
684 dependent
685 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
686 free_dependence_relation (ddr);
687 if (dependent)
688 break;
691 else
693 ddr_p ddr = initialize_data_dependence_relation (dr_a,
694 dr_b, vNULL);
695 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
696 free_dependence_relation (ddr);
698 if (dependent)
699 return false;
702 return true;
706 /* Function vect_slp_analyze_instance_dependence.
708 Examine all the data references in the SLP instance, and make sure there
709 do not exist any data dependences between them that would prevent
710 vectorization. Return TRUE if the instance can be vectorized. */
712 bool
713 vect_slp_analyze_instance_dependence (slp_instance instance)
715 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
717 /* The stores of this instance are at the root of the SLP tree. */
718 slp_tree store = SLP_INSTANCE_TREE (instance);
719 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
720 store = NULL;
722 /* Verify we can sink stores to the vectorized stmt insert location. */
723 stmt_vec_info last_store_info = NULL;
724 if (store)
726 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
727 return false;
729 /* Mark stores in this instance and remember the last one. */
730 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
731 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
732 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
735 bool res = true;
737 /* Verify we can sink loads to the vectorized stmt insert location,
738 special-casing stores of this instance. */
739 slp_tree load;
740 unsigned int i;
741 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
742 if (! vect_slp_analyze_node_dependences (instance, load,
743 store
744 ? SLP_TREE_SCALAR_STMTS (store)
745 : vNULL, last_store_info))
747 res = false;
748 break;
751 /* Unset the visited flag. */
752 if (store)
753 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
754 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
756 return res;
759 /* Record the base alignment guarantee given by DRB, which occurs
760 in STMT_INFO. */
762 static void
763 vect_record_base_alignment (stmt_vec_info stmt_info,
764 innermost_loop_behavior *drb)
766 vec_info *vinfo = stmt_info->vinfo;
767 bool existed;
768 innermost_loop_behavior *&entry
769 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
770 if (!existed || entry->base_alignment < drb->base_alignment)
772 entry = drb;
773 if (dump_enabled_p ())
774 dump_printf_loc (MSG_NOTE, vect_location,
775 "recording new base alignment for %T\n"
776 " alignment: %d\n"
777 " misalignment: %d\n"
778 " based on: %G",
779 drb->base_address,
780 drb->base_alignment,
781 drb->base_misalignment,
782 stmt_info->stmt);
786 /* If the region we're going to vectorize is reached, all unconditional
787 data references occur at least once. We can therefore pool the base
788 alignment guarantees from each unconditional reference. Do this by
789 going through all the data references in VINFO and checking whether
790 the containing statement makes the reference unconditionally. If so,
791 record the alignment of the base address in VINFO so that it can be
792 used for all other references with the same base. */
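/* Illustrative example (added for exposition, not from the GCC sources):
   if the region contains

       p[0] = 0;        <-- unconditional, gives an alignment guarantee
                            for base P
       if (c)
         p[i] = 1;      <-- conditional access with the same base

   then, because reaching the region implies the first access executes,
   the alignment recorded for base P can also be used when computing the
   misalignment of the conditional reference.  */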
794 void
795 vect_record_base_alignments (vec_info *vinfo)
797 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
798 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
799 data_reference *dr;
800 unsigned int i;
801 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
803 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
804 stmt_vec_info stmt_info = dr_info->stmt;
805 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
806 && STMT_VINFO_VECTORIZABLE (stmt_info)
807 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
809 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
811 /* If DR is nested in the loop that is being vectorized, we can also
812 record the alignment of the base wrt the outer loop. */
813 if (loop && nested_in_vect_loop_p (loop, stmt_info))
814 vect_record_base_alignment
815 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
820 /* Return the target alignment for the vectorized form of DR_INFO. */
822 static unsigned int
823 vect_calculate_target_alignment (dr_vec_info *dr_info)
825 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
826 return targetm.vectorize.preferred_vector_alignment (vectype);
829 /* Function vect_compute_data_ref_alignment
831 Compute the misalignment of the data reference DR_INFO.
833 Output:
834 1. DR_MISALIGNMENT (DR_INFO) is defined.
836 FOR NOW: No analysis is actually performed. Misalignment is calculated
837 only for trivial cases. TODO. */
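/* Illustrative example (added for exposition, not from the GCC sources):
   with a 16-byte vector alignment, an access a[i + 1] to 'double a[N]'
   whose base is 16-byte aligned has base_misalignment 0 and init 8, so
   the computed DR_MISALIGNMENT is (0 + 8) mod 16 = 8 bytes.  If the step
   were not a compile-time constant, or did not preserve the misalignment
   across iterations, the result would instead be
   DR_MISALIGNMENT_UNKNOWN.  */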
839 static void
840 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
842 stmt_vec_info stmt_info = dr_info->stmt;
843 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
844 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
845 struct loop *loop = NULL;
846 tree ref = DR_REF (dr_info->dr);
847 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
849 if (dump_enabled_p ())
850 dump_printf_loc (MSG_NOTE, vect_location,
851 "vect_compute_data_ref_alignment:\n");
853 if (loop_vinfo)
854 loop = LOOP_VINFO_LOOP (loop_vinfo);
856 /* Initialize misalignment to unknown. */
857 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
859 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
860 return;
862 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
863 bool step_preserves_misalignment_p;
865 unsigned HOST_WIDE_INT vector_alignment
866 = vect_calculate_target_alignment (dr_info) / BITS_PER_UNIT;
867 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
869 /* No step for BB vectorization. */
870 if (!loop)
872 gcc_assert (integer_zerop (drb->step));
873 step_preserves_misalignment_p = true;
876 /* In case the dataref is in an inner-loop of the loop that is being
877 vectorized (LOOP), we use the base and misalignment information
878 relative to the outer-loop (LOOP). This is ok only if the misalignment
879 stays the same throughout the execution of the inner-loop, which is why
880 we have to check that the stride of the dataref in the inner-loop is
881 evenly divisible by the vector alignment. */
882 else if (nested_in_vect_loop_p (loop, stmt_info))
884 step_preserves_misalignment_p
885 = (DR_STEP_ALIGNMENT (dr_info->dr) % vector_alignment) == 0;
887 if (dump_enabled_p ())
889 if (step_preserves_misalignment_p)
890 dump_printf_loc (MSG_NOTE, vect_location,
891 "inner step divides the vector alignment.\n");
892 else
893 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
894 "inner step doesn't divide the vector"
895 " alignment.\n");
899 /* Similarly we can only use base and misalignment information relative to
900 an innermost loop if the misalignment stays the same throughout the
901 execution of the loop. As above, this is the case if the stride of
902 the dataref is evenly divisible by the alignment. */
903 else
905 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
906 step_preserves_misalignment_p
907 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vector_alignment);
909 if (!step_preserves_misalignment_p && dump_enabled_p ())
910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
911 "step doesn't divide the vector alignment.\n");
914 unsigned int base_alignment = drb->base_alignment;
915 unsigned int base_misalignment = drb->base_misalignment;
917 /* Calculate the maximum of the pooled base address alignment and the
918 alignment that we can compute for DR itself. */
919 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
920 if (entry && base_alignment < (*entry)->base_alignment)
922 base_alignment = (*entry)->base_alignment;
923 base_misalignment = (*entry)->base_misalignment;
926 if (drb->offset_alignment < vector_alignment
927 || !step_preserves_misalignment_p
928 /* We need to know whether the step wrt the vectorized loop is
929 negative when computing the starting misalignment below. */
930 || TREE_CODE (drb->step) != INTEGER_CST)
932 if (dump_enabled_p ())
933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
934 "Unknown alignment for access: %T\n", ref);
935 return;
938 if (base_alignment < vector_alignment)
940 unsigned int max_alignment;
941 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
942 if (max_alignment < vector_alignment
943 || !vect_can_force_dr_alignment_p (base,
944 vector_alignment * BITS_PER_UNIT))
946 if (dump_enabled_p ())
947 dump_printf_loc (MSG_NOTE, vect_location,
948 "can't force alignment of ref: %T\n", ref);
949 return;
952 /* Force the alignment of the decl.
953 NOTE: This is the only change to the code we make during
954 the analysis phase, before deciding to vectorize the loop. */
955 if (dump_enabled_p ())
956 dump_printf_loc (MSG_NOTE, vect_location,
957 "force alignment of %T\n", ref);
959 dr_info->base_decl = base;
960 dr_info->base_misaligned = true;
961 base_misalignment = 0;
963 poly_int64 misalignment
964 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
966 /* If this is a backward running DR then first access in the larger
967 vectype actually is N-1 elements before the address in the DR.
968 Adjust misalign accordingly. */
969 if (tree_int_cst_sgn (drb->step) < 0)
970 /* PLUS because STEP is negative. */
971 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
972 * TREE_INT_CST_LOW (drb->step));
974 unsigned int const_misalignment;
975 if (!known_misalignment (misalignment, vector_alignment,
976 &const_misalignment))
978 if (dump_enabled_p ())
979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
980 "Non-constant misalignment for access: %T\n", ref);
981 return;
984 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
986 if (dump_enabled_p ())
987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
988 "misalign = %d bytes of ref %T\n",
989 DR_MISALIGNMENT (dr_info), ref);
991 return;
994 /* Function vect_update_misalignment_for_peel.
995 Sets DR_INFO's misalignment
996 - to 0 if it has the same alignment as DR_PEEL_INFO,
997 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
998 - to -1 (unknown) otherwise.
1000 DR_INFO - the data reference whose misalignment is to be adjusted.
1001 DR_PEEL_INFO - the data reference whose misalignment is being made
1002 zero in the vector loop by the peel.
1003 NPEEL - the number of iterations in the peel loop if the misalignment
1004 of DR_PEEL_INFO is known at compile time. */
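/* Illustrative example (added for exposition, not from the GCC sources):
   with a 16-byte target alignment, DR_STEP 4 and DR_MISALIGNMENT 8,
   peeling NPEEL = 2 iterations gives (8 + 2 * 4) & 15 = 0, i.e. the
   access becomes aligned; with NPEEL = 1 it would become
   (8 + 4) & 15 = 12.  */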
1006 static void
1007 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1008 dr_vec_info *dr_peel_info, int npeel)
1010 unsigned int i;
1011 vec<dr_p> same_aligned_drs;
1012 struct data_reference *current_dr;
1013 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1015 /* It can be assumed that if dr_info has the same alignment as dr_peel,
1016 it is aligned in the vector loop. */
1017 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1018 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1020 if (current_dr != dr_info->dr)
1021 continue;
1022 gcc_assert (!known_alignment_for_access_p (dr_info)
1023 || !known_alignment_for_access_p (dr_peel_info)
1024 || (DR_MISALIGNMENT (dr_info)
1025 == DR_MISALIGNMENT (dr_peel_info)));
1026 SET_DR_MISALIGNMENT (dr_info, 0);
1027 return;
1030 if (known_alignment_for_access_p (dr_info)
1031 && known_alignment_for_access_p (dr_peel_info))
1033 int misal = DR_MISALIGNMENT (dr_info);
1034 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1035 misal &= DR_TARGET_ALIGNMENT (dr_info) - 1;
1036 SET_DR_MISALIGNMENT (dr_info, misal);
1037 return;
1040 if (dump_enabled_p ())
1041 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1042 "to unknown (-1).\n");
1043 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1047 /* Function verify_data_ref_alignment
1049 Return TRUE if DR_INFO can be handled with respect to alignment. */
1051 static opt_result
1052 verify_data_ref_alignment (dr_vec_info *dr_info)
1054 enum dr_alignment_support supportable_dr_alignment
1055 = vect_supportable_dr_alignment (dr_info, false);
1056 if (!supportable_dr_alignment)
1057 return opt_result::failure_at
1058 (dr_info->stmt->stmt,
1059 DR_IS_READ (dr_info->dr)
1060 ? "not vectorized: unsupported unaligned load: %T\n"
1061 : "not vectorized: unsupported unaligned store: %T\n",
1062 DR_REF (dr_info->dr));
1064 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1065 dump_printf_loc (MSG_NOTE, vect_location,
1066 "Vectorizing an unaligned access.\n");
1068 return opt_result::success ();
1071 /* Function vect_verify_datarefs_alignment
1073 Return TRUE if all data references in the loop can be
1074 handled with respect to alignment. */
1076 opt_result
1077 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1079 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1080 struct data_reference *dr;
1081 unsigned int i;
1083 FOR_EACH_VEC_ELT (datarefs, i, dr)
1085 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1086 stmt_vec_info stmt_info = dr_info->stmt;
1088 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1089 continue;
1091 /* For interleaving, only the alignment of the first access matters. */
1092 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1093 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1094 continue;
1096 /* Strided accesses perform only component accesses, alignment is
1097 irrelevant for them. */
1098 if (STMT_VINFO_STRIDED_P (stmt_info)
1099 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1100 continue;
1102 opt_result res = verify_data_ref_alignment (dr_info);
1103 if (!res)
1104 return res;
1107 return opt_result::success ();
1110 /* Given a memory reference EXP return whether its alignment is less
1111 than its size. */
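/* Illustrative example (added for exposition, not from the GCC sources):
   for

       struct __attribute__ ((packed)) s { char c; int i; } *p;

   the reference p->i has size 4 bytes but only 1-byte known alignment,
   so this predicate returns true for it, whereas an ordinary 'int'
   object is size-aligned and yields false.  */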
1113 static bool
1114 not_size_aligned (tree exp)
1116 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1117 return true;
1119 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1120 > get_object_alignment (exp));
1123 /* Function vector_alignment_reachable_p
1125 Return true if vector alignment for DR_INFO is reachable by peeling
1126 a few loop iterations. Return false otherwise. */
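/* Illustrative example (added for exposition, not from the GCC sources):
   peeling removes whole scalar iterations, so it can only change the
   misalignment in multiples of the element size.  With 4-byte elements,
   a known misalignment of 8 bytes can be peeled away, but a misalignment
   of 2 bytes cannot, and this function returns false for the latter.  */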
1128 static bool
1129 vector_alignment_reachable_p (dr_vec_info *dr_info)
1131 stmt_vec_info stmt_info = dr_info->stmt;
1132 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1134 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1136 /* For interleaved access we peel only if the number of iterations in
1137 the prolog loop ({VF - misalignment}) is a multiple of the
1138 number of the interleaved accesses. */
1139 int elem_size, mis_in_elements;
1141 /* FORNOW: handle only known alignment. */
1142 if (!known_alignment_for_access_p (dr_info))
1143 return false;
1145 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1146 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1147 elem_size = vector_element_size (vector_size, nelements);
1148 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1150 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1151 return false;
1154 /* If the misalignment is known at compile time then allow peeling
1155 only if natural alignment is reachable through peeling. */
1156 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1158 HOST_WIDE_INT elmsize =
1159 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1160 if (dump_enabled_p ())
1162 dump_printf_loc (MSG_NOTE, vect_location,
1163 "data size = %wd. misalignment = %d.\n", elmsize,
1164 DR_MISALIGNMENT (dr_info));
1166 if (DR_MISALIGNMENT (dr_info) % elmsize)
1168 if (dump_enabled_p ())
1169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1170 "data size does not divide the misalignment.\n");
1171 return false;
1175 if (!known_alignment_for_access_p (dr_info))
1177 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1178 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1179 if (dump_enabled_p ())
1180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1181 "Unknown misalignment, %snaturally aligned\n",
1182 is_packed ? "not " : "");
1183 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1186 return true;
1190 /* Calculate the cost of the memory access represented by DR_INFO. */
1192 static void
1193 vect_get_data_access_cost (dr_vec_info *dr_info,
1194 unsigned int *inside_cost,
1195 unsigned int *outside_cost,
1196 stmt_vector_for_cost *body_cost_vec,
1197 stmt_vector_for_cost *prologue_cost_vec)
1199 stmt_vec_info stmt_info = dr_info->stmt;
1200 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1201 int ncopies;
1203 if (PURE_SLP_STMT (stmt_info))
1204 ncopies = 1;
1205 else
1206 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1208 if (DR_IS_READ (dr_info->dr))
1209 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1210 prologue_cost_vec, body_cost_vec, false);
1211 else
1212 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1214 if (dump_enabled_p ())
1215 dump_printf_loc (MSG_NOTE, vect_location,
1216 "vect_get_data_access_cost: inside_cost = %d, "
1217 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1221 typedef struct _vect_peel_info
1223 dr_vec_info *dr_info;
1224 int npeel;
1225 unsigned int count;
1226 } *vect_peel_info;
1228 typedef struct _vect_peel_extended_info
1230 struct _vect_peel_info peel_info;
1231 unsigned int inside_cost;
1232 unsigned int outside_cost;
1233 } *vect_peel_extended_info;
1236 /* Peeling hashtable helpers. */
1238 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1240 static inline hashval_t hash (const _vect_peel_info *);
1241 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1244 inline hashval_t
1245 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1247 return (hashval_t) peel_info->npeel;
1250 inline bool
1251 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1253 return (a->npeel == b->npeel);
1257 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1259 static void
1260 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1261 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1262 int npeel)
1264 struct _vect_peel_info elem, *slot;
1265 _vect_peel_info **new_slot;
1266 bool supportable_dr_alignment
1267 = vect_supportable_dr_alignment (dr_info, true);
1269 elem.npeel = npeel;
1270 slot = peeling_htab->find (&elem);
1271 if (slot)
1272 slot->count++;
1273 else
1275 slot = XNEW (struct _vect_peel_info);
1276 slot->npeel = npeel;
1277 slot->dr_info = dr_info;
1278 slot->count = 1;
1279 new_slot = peeling_htab->find_slot (slot, INSERT);
1280 *new_slot = slot;
1283 if (!supportable_dr_alignment
1284 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1285 slot->count += VECT_MAX_COST;
1289 /* Traverse peeling hash table to find peeling option that aligns maximum
1290 number of data accesses. */
1292 int
1293 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1294 _vect_peel_extended_info *max)
1296 vect_peel_info elem = *slot;
1298 if (elem->count > max->peel_info.count
1299 || (elem->count == max->peel_info.count
1300 && max->peel_info.npeel > elem->npeel))
1302 max->peel_info.npeel = elem->npeel;
1303 max->peel_info.count = elem->count;
1304 max->peel_info.dr_info = elem->dr_info;
1307 return 1;
1310 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1311 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1312 we assume DR0_INFO's misalignment will be zero after peeling. */
1314 static void
1315 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1316 dr_vec_info *dr0_info,
1317 unsigned int *inside_cost,
1318 unsigned int *outside_cost,
1319 stmt_vector_for_cost *body_cost_vec,
1320 stmt_vector_for_cost *prologue_cost_vec,
1321 unsigned int npeel,
1322 bool unknown_misalignment)
1324 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1325 unsigned i;
1326 data_reference *dr;
1328 FOR_EACH_VEC_ELT (datarefs, i, dr)
1330 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1331 stmt_vec_info stmt_info = dr_info->stmt;
1332 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1333 continue;
1335 /* For interleaving, only the alignment of the first access
1336 matters. */
1337 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1338 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1339 continue;
1341 /* Strided accesses perform only component accesses, alignment is
1342 irrelevant for them. */
1343 if (STMT_VINFO_STRIDED_P (stmt_info)
1344 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1345 continue;
1347 int save_misalignment;
1348 save_misalignment = DR_MISALIGNMENT (dr_info);
1349 if (npeel == 0)
1351 else if (unknown_misalignment && dr_info == dr0_info)
1352 SET_DR_MISALIGNMENT (dr_info, 0);
1353 else
1354 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1355 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1356 body_cost_vec, prologue_cost_vec);
1357 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1361 /* Traverse peeling hash table and calculate cost for each peeling option.
1362 Find the one with the lowest cost. */
1364 int
1365 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1366 _vect_peel_extended_info *min)
1368 vect_peel_info elem = *slot;
1369 int dummy;
1370 unsigned int inside_cost = 0, outside_cost = 0;
1371 stmt_vec_info stmt_info = elem->dr_info->stmt;
1372 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1373 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1374 epilogue_cost_vec;
1376 prologue_cost_vec.create (2);
1377 body_cost_vec.create (2);
1378 epilogue_cost_vec.create (2);
1380 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1381 &outside_cost, &body_cost_vec,
1382 &prologue_cost_vec, elem->npeel, false);
1384 body_cost_vec.release ();
1386 outside_cost += vect_get_known_peeling_cost
1387 (loop_vinfo, elem->npeel, &dummy,
1388 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1389 &prologue_cost_vec, &epilogue_cost_vec);
1391 /* Prologue and epilogue costs are added to the target model later.
1392 These costs depend only on the scalar iteration cost, the
1393 number of peeling iterations finally chosen, and the number of
1394 misaligned statements. So discard the information found here. */
1395 prologue_cost_vec.release ();
1396 epilogue_cost_vec.release ();
1398 if (inside_cost < min->inside_cost
1399 || (inside_cost == min->inside_cost
1400 && outside_cost < min->outside_cost))
1402 min->inside_cost = inside_cost;
1403 min->outside_cost = outside_cost;
1404 min->peel_info.dr_info = elem->dr_info;
1405 min->peel_info.npeel = elem->npeel;
1406 min->peel_info.count = elem->count;
1409 return 1;
1413 /* Choose best peeling option by traversing peeling hash table and either
1414 choosing an option with the lowest cost (if cost model is enabled) or the
1415 option that aligns as many accesses as possible. */
1417 static struct _vect_peel_extended_info
1418 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1419 loop_vec_info loop_vinfo)
1421 struct _vect_peel_extended_info res;
1423 res.peel_info.dr_info = NULL;
1425 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1427 res.inside_cost = INT_MAX;
1428 res.outside_cost = INT_MAX;
1429 peeling_htab->traverse <_vect_peel_extended_info *,
1430 vect_peeling_hash_get_lowest_cost> (&res);
1432 else
1434 res.peel_info.count = 0;
1435 peeling_htab->traverse <_vect_peel_extended_info *,
1436 vect_peeling_hash_get_most_frequent> (&res);
1437 res.inside_cost = 0;
1438 res.outside_cost = 0;
1441 return res;
1444 /* Return true if the new peeling NPEEL is supported. */
1446 static bool
1447 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1448 unsigned npeel)
1450 unsigned i;
1451 struct data_reference *dr = NULL;
1452 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1453 enum dr_alignment_support supportable_dr_alignment;
1455 /* Ensure that all data refs can be vectorized after the peel. */
1456 FOR_EACH_VEC_ELT (datarefs, i, dr)
1458 int save_misalignment;
1460 if (dr == dr0_info->dr)
1461 continue;
1463 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1464 stmt_vec_info stmt_info = dr_info->stmt;
1465 /* For interleaving, only the alignment of the first access
1466 matters. */
1467 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1468 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1469 continue;
1471 /* Strided accesses perform only component accesses, alignment is
1472 irrelevant for them. */
1473 if (STMT_VINFO_STRIDED_P (stmt_info)
1474 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1475 continue;
1477 save_misalignment = DR_MISALIGNMENT (dr_info);
1478 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1479 supportable_dr_alignment
1480 = vect_supportable_dr_alignment (dr_info, false);
1481 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1483 if (!supportable_dr_alignment)
1484 return false;
1487 return true;
1490 /* Function vect_enhance_data_refs_alignment
1492 This pass will use loop versioning and loop peeling in order to enhance
1493 the alignment of data references in the loop.
1495 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1496 original loop is to be vectorized. Any other loops that are created by
1497 the transformations performed in this pass - are not supposed to be
1498 vectorized. This restriction will be relaxed.
1500 This pass will require a cost model to guide it whether to apply peeling
1501 or versioning or a combination of the two. For example, the scheme that
1502 intel uses when given a loop with several memory accesses, is as follows:
1503 choose one memory access ('p') whose alignment you want to force by doing
1504 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1505 other accesses are not necessarily aligned, or (2) use loop versioning to
1506 generate one loop in which all accesses are aligned, and another loop in
1507 which only 'p' is necessarily aligned.
1509 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1510 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1511 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1513 Devising a cost model is the most critical aspect of this work. It will
1514 guide us on which access to peel for, whether to use loop versioning, how
1515 many versions to create, etc. The cost model will probably consist of
1516 generic considerations as well as target specific considerations (on
1517 powerpc for example, misaligned stores are more painful than misaligned
1518 loads).
1520 Here are the general steps involved in alignment enhancements:
1522 -- original loop, before alignment analysis:
1523 for (i=0; i<N; i++){
1524 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1525 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1528 -- After vect_compute_data_refs_alignment:
1529 for (i=0; i<N; i++){
1530 x = q[i]; # DR_MISALIGNMENT(q) = 3
1531 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1534 -- Possibility 1: we do loop versioning:
1535 if (p is aligned) {
1536 for (i=0; i<N; i++){ # loop 1A
1537 x = q[i]; # DR_MISALIGNMENT(q) = 3
1538 p[i] = y; # DR_MISALIGNMENT(p) = 0
1541 else {
1542 for (i=0; i<N; i++){ # loop 1B
1543 x = q[i]; # DR_MISALIGNMENT(q) = 3
1544 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1548 -- Possibility 2: we do loop peeling:
1549 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1550 x = q[i];
1551 p[i] = y;
1553 for (i = 3; i < N; i++){ # loop 2A
1554 x = q[i]; # DR_MISALIGNMENT(q) = 0
1555 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1558 -- Possibility 3: combination of loop peeling and versioning:
1559 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1560 x = q[i];
1561 p[i] = y;
1563 if (p is aligned) {
1564 for (i = 3; i<N; i++){ # loop 3A
1565 x = q[i]; # DR_MISALIGNMENT(q) = 0
1566 p[i] = y; # DR_MISALIGNMENT(p) = 0
1569 else {
1570 for (i = 3; i<N; i++){ # loop 3B
1571 x = q[i]; # DR_MISALIGNMENT(q) = 0
1572 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1576 These loops are later passed to loop_transform to be vectorized. The
1577 vectorizer will use the alignment information to guide the transformation
1578 (whether to generate regular loads/stores, or with special handling for
1579 misalignment). */
1581 opt_result
1582 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1584 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1585 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1586 enum dr_alignment_support supportable_dr_alignment;
1587 dr_vec_info *first_store = NULL;
1588 dr_vec_info *dr0_info = NULL;
1589 struct data_reference *dr;
1590 unsigned int i, j;
1591 bool do_peeling = false;
1592 bool do_versioning = false;
1593 unsigned int npeel = 0;
1594 bool one_misalignment_known = false;
1595 bool one_misalignment_unknown = false;
1596 bool one_dr_unsupportable = false;
1597 dr_vec_info *unsupportable_dr_info = NULL;
1598 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1599 unsigned possible_npeel_number = 1;
1600 tree vectype;
1601 unsigned int mis, same_align_drs_max = 0;
1602 hash_table<peel_info_hasher> peeling_htab (1);
1604 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1606 /* Reset data so we can safely be called multiple times. */
1607 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1608 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1610 /* While cost model enhancements are expected in the future, the high level
1611 view of the code at this time is as follows:
1613 A) If there is a misaligned access then see if peeling to align
1614 this access can make all data references satisfy
1615 vect_supportable_dr_alignment. If so, update data structures
1616 as needed and return true.
1618 B) If peeling wasn't possible and there is a data reference with an
1619 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1620 then see if loop versioning checks can be used to make all data
1621 references satisfy vect_supportable_dr_alignment. If so, update
1622 data structures as needed and return true.
1624 C) If neither peeling nor versioning were successful then return false if
1625 any data reference does not satisfy vect_supportable_dr_alignment.
1627 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1629 Note, Possibility 3 above (which is peeling and versioning together) is not
1630 being done at this time. */
1632 /* (1) Peeling to force alignment. */
1634 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1635 Considerations:
1636 + How many accesses will become aligned due to the peeling
1637 - How many accesses will become unaligned due to the peeling,
1638 and the cost of misaligned accesses.
1639 - The cost of peeling (the extra runtime checks, the increase
1640 in code size). */
1642 FOR_EACH_VEC_ELT (datarefs, i, dr)
1644 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1645 stmt_vec_info stmt_info = dr_info->stmt;
1647 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1648 continue;
1650 /* For interleaving, only the alignment of the first access
1651 matters. */
1652 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1653 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1654 continue;
1656 /* For scatter-gather or invariant accesses there is nothing
1657 to enhance. */
1658 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1659 || integer_zerop (DR_STEP (dr)))
1660 continue;
1662 /* Strided accesses perform only component accesses, alignment is
1663 irrelevant for them. */
1664 if (STMT_VINFO_STRIDED_P (stmt_info)
1665 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1666 continue;
1668 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1669 do_peeling = vector_alignment_reachable_p (dr_info);
1670 if (do_peeling)
1672 if (known_alignment_for_access_p (dr_info))
1674 unsigned int npeel_tmp = 0;
1675 bool negative = tree_int_cst_compare (DR_STEP (dr),
1676 size_zero_node) < 0;
1678 vectype = STMT_VINFO_VECTYPE (stmt_info);
1679 unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);
1680 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1681 mis = (negative
1682 ? DR_MISALIGNMENT (dr_info)
1683 : -DR_MISALIGNMENT (dr_info));
1684 if (DR_MISALIGNMENT (dr_info) != 0)
1685 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1687 /* For multiple types, it is possible that the bigger type access
1688 will have more than one peeling option. E.g., a loop with two
1689 types: one of size (vector size / 4), and the other one of
1690 size (vector size / 8). The vectorization factor will be 8. If both
1691 accesses are misaligned by 3, the first one needs one scalar
1692 iteration to be aligned, and the second one needs 5. But the
1693 first one will be aligned also by peeling 5 scalar
1694 iterations, and in that case both accesses will be aligned.
1695 Hence, except for the immediate peeling amount, we also want
1696 to try to add full vector size, while we don't exceed
1697 vectorization factor.
1698 We do this automatically for cost model, since we calculate
1699 cost for every peeling option. */
1700 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1702 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1703 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1704 possible_npeel_number
1705 = vect_get_num_vectors (nscalars, vectype);
1707 /* NPEEL_TMP is 0 when there is no misalignment, but in that case
1708 also allow peeling NELEMENTS. */
1709 if (DR_MISALIGNMENT (dr_info) == 0)
1710 possible_npeel_number++;
1713 /* Save info about DR in the hash table. Also include peeling
1714 amounts according to the explanation above. */
1715 for (j = 0; j < possible_npeel_number; j++)
1717 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1718 dr_info, npeel_tmp);
1719 npeel_tmp += target_align / dr_size;
1722 one_misalignment_known = true;
1724 else
1726 /* If we don't know any misalignment values, we prefer
1727 peeling for the data-ref that has the maximum number of data-refs
1728 with the same alignment, unless the target prefers to align
1729 stores over loads. */
1730 unsigned same_align_drs
1731 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1732 if (!dr0_info
1733 || same_align_drs_max < same_align_drs)
1735 same_align_drs_max = same_align_drs;
1736 dr0_info = dr_info;
1738 /* For data-refs with the same number of related
1739 accesses, prefer the one where the misalignment
1740 computation will be invariant in the outermost loop. */
1741 else if (same_align_drs_max == same_align_drs)
1743 struct loop *ivloop0, *ivloop;
1744 ivloop0 = outermost_invariant_loop_for_expr
1745 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1746 ivloop = outermost_invariant_loop_for_expr
1747 (loop, DR_BASE_ADDRESS (dr));
1748 if ((ivloop && !ivloop0)
1749 || (ivloop && ivloop0
1750 && flow_loop_nested_p (ivloop, ivloop0)))
1751 dr0_info = dr_info;
1754 one_misalignment_unknown = true;
1756 /* Check for data refs with unsupportable alignment that
1757 can be peeled. */
1758 if (!supportable_dr_alignment)
1760 one_dr_unsupportable = true;
1761 unsupportable_dr_info = dr_info;
1764 if (!first_store && DR_IS_WRITE (dr))
1765 first_store = dr_info;
1768 else
1770 if (!aligned_access_p (dr_info))
1772 if (dump_enabled_p ())
1773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1774 "vector alignment may not be reachable\n");
1775 break;
1780 /* Check if we can possibly peel the loop. */
1781 if (!vect_can_advance_ivs_p (loop_vinfo)
1782 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1783 || loop->inner)
1784 do_peeling = false;
1786 struct _vect_peel_extended_info peel_for_known_alignment;
1787 struct _vect_peel_extended_info peel_for_unknown_alignment;
1788 struct _vect_peel_extended_info best_peel;
1790 peel_for_unknown_alignment.inside_cost = INT_MAX;
1791 peel_for_unknown_alignment.outside_cost = INT_MAX;
1792 peel_for_unknown_alignment.peel_info.count = 0;
1794 if (do_peeling
1795 && one_misalignment_unknown)
1797 /* Check whether the target prefers to align stores over loads, i.e.,
1798 whether misaligned stores are more expensive than misaligned loads
1799 (taking drs with the same alignment into account). */
1800 unsigned int load_inside_cost = 0;
1801 unsigned int load_outside_cost = 0;
1802 unsigned int store_inside_cost = 0;
1803 unsigned int store_outside_cost = 0;
1804 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
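/* Illustrative note: with an assumed VF-for-cost of 8, ESTIMATED_NPEELS is 4;
   since the actual misalignment is unknown we cost roughly half a vector's
   worth of peeled scalar iterations.  */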
1806 stmt_vector_for_cost dummy;
1807 dummy.create (2);
1808 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1809 &load_inside_cost,
1810 &load_outside_cost,
1811 &dummy, &dummy, estimated_npeels, true);
1812 dummy.release ();
1814 if (first_store)
1816 dummy.create (2);
1817 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1818 &store_inside_cost,
1819 &store_outside_cost,
1820 &dummy, &dummy,
1821 estimated_npeels, true);
1822 dummy.release ();
1824 else
1826 store_inside_cost = INT_MAX;
1827 store_outside_cost = INT_MAX;
1830 if (load_inside_cost > store_inside_cost
1831 || (load_inside_cost == store_inside_cost
1832 && load_outside_cost > store_outside_cost))
1834 dr0_info = first_store;
1835 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1836 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1838 else
1840 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1841 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1844 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1845 prologue_cost_vec.create (2);
1846 epilogue_cost_vec.create (2);
1848 int dummy2;
1849 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1850 (loop_vinfo, estimated_npeels, &dummy2,
1851 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1852 &prologue_cost_vec, &epilogue_cost_vec);
1854 prologue_cost_vec.release ();
1855 epilogue_cost_vec.release ();
1857 peel_for_unknown_alignment.peel_info.count = 1
1858 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1861 peel_for_unknown_alignment.peel_info.npeel = 0;
1862 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1864 best_peel = peel_for_unknown_alignment;
1866 peel_for_known_alignment.inside_cost = INT_MAX;
1867 peel_for_known_alignment.outside_cost = INT_MAX;
1868 peel_for_known_alignment.peel_info.count = 0;
1869 peel_for_known_alignment.peel_info.dr_info = NULL;
1871 if (do_peeling && one_misalignment_known)
1873 /* Peeling is possible, but no data access strictly requires alignment
1874 in order to be supported. So we try to choose the best possible peeling
1875 from the hash table. */
1876 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1877 (&peeling_htab, loop_vinfo);
1880 /* Compare costs of peeling for known and unknown alignment. */
1881 if (peel_for_known_alignment.peel_info.dr_info != NULL
1882 && peel_for_unknown_alignment.inside_cost
1883 >= peel_for_known_alignment.inside_cost)
1885 best_peel = peel_for_known_alignment;
1887 /* If the best peeling for known alignment has NPEEL == 0, perform no
1888 peeling at all except if there is an unsupportable dr that we can
1889 align. */
1890 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1891 do_peeling = false;
1894 /* If there is an unsupportable data ref, prefer this over all choices so far
1895 since we'd have to discard a chosen peeling except when it accidentally
1896 aligned the unsupportable data ref. */
1897 if (one_dr_unsupportable)
1898 dr0_info = unsupportable_dr_info;
1899 else if (do_peeling)
1901 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1902 TODO: Use nopeel_outside_cost or get rid of it? */
1903 unsigned nopeel_inside_cost = 0;
1904 unsigned nopeel_outside_cost = 0;
1906 stmt_vector_for_cost dummy;
1907 dummy.create (2);
1908 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
1909 &nopeel_outside_cost, &dummy, &dummy,
1910 0, false);
1911 dummy.release ();
1913 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1914 costs will be recorded. */
1915 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1916 prologue_cost_vec.create (2);
1917 epilogue_cost_vec.create (2);
1919 int dummy2;
1920 nopeel_outside_cost += vect_get_known_peeling_cost
1921 (loop_vinfo, 0, &dummy2,
1922 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1923 &prologue_cost_vec, &epilogue_cost_vec);
1925 prologue_cost_vec.release ();
1926 epilogue_cost_vec.release ();
1928 npeel = best_peel.peel_info.npeel;
1929 dr0_info = best_peel.peel_info.dr_info;
1931 /* If doing no peeling is at most as expensive as the best peeling we
1932 have so far, don't perform any peeling. */
1933 if (nopeel_inside_cost <= best_peel.inside_cost)
1934 do_peeling = false;
1937 if (do_peeling)
1939 stmt_vec_info stmt_info = dr0_info->stmt;
1940 vectype = STMT_VINFO_VECTYPE (stmt_info);
1942 if (known_alignment_for_access_p (dr0_info))
1944 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
1945 size_zero_node) < 0;
1946 if (!npeel)
1948 /* Since it's known at compile time, compute the number of
1949 iterations in the peeled loop (the peeling factor) for use in
1950 updating DR_MISALIGNMENT values. The peeling factor is the
1951 vectorization factor minus the misalignment as an element
1952 count. */
1953 mis = (negative
1954 ? DR_MISALIGNMENT (dr0_info)
1955 : -DR_MISALIGNMENT (dr0_info));
1956 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
1957 npeel = ((mis & (target_align - 1))
1958 / vect_get_scalar_dr_size (dr0_info));
1961 /* For interleaved data access every iteration accesses all the
1962 members of the group, therefore we divide the number of iterations
1963 by the group size. */
1964 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1965 npeel /= DR_GROUP_SIZE (stmt_info);
1967 if (dump_enabled_p ())
1968 dump_printf_loc (MSG_NOTE, vect_location,
1969 "Try peeling by %d\n", npeel);
1972 /* Ensure that all datarefs can be vectorized after the peel. */
1973 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
1974 do_peeling = false;
1976 /* Check if all datarefs are supportable and log. */
1977 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
1979 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
1980 if (!stat)
1981 do_peeling = false;
1982 else
1983 return stat;
1986 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1987 if (do_peeling)
1989 unsigned max_allowed_peel
1990 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1991 if (max_allowed_peel != (unsigned)-1)
1993 unsigned max_peel = npeel;
1994 if (max_peel == 0)
1996 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
1997 max_peel = (target_align
1998 / vect_get_scalar_dr_size (dr0_info) - 1);
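/* Worked example (assumed numbers): with NPEEL unknown (0), a target
   alignment of 16 bytes and 4-byte elements, MAX_PEEL is 16 / 4 - 1 == 3,
   the largest number of scalar iterations peeling might need.  */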
2000 if (max_peel > max_allowed_peel)
2002 do_peeling = false;
2003 if (dump_enabled_p ())
2004 dump_printf_loc (MSG_NOTE, vect_location,
2005 "Disable peeling, max peels reached: %d\n", max_peel);
2010 /* Cost model #2 - if peeling may result in a remaining loop not
2011 iterating enough to be vectorized then do not peel. Since this
2012 is a cost heuristic rather than a correctness decision, use the
2013 most likely runtime value for variable vectorization factors. */
2014 if (do_peeling
2015 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2017 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2018 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2019 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2020 < assumed_vf + max_peel)
2021 do_peeling = false;
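/* E.g. (assumed values): with an assumed VF of 4 and an unknown peel amount,
   MAX_PEEL is 3, so the check above gives up on peeling unless at least
   4 + 3 = 7 scalar iterations are known to be available.  */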
2024 if (do_peeling)
2026 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2027 If the misalignment of DR_i is identical to that of dr0 then set
2028 DR_MISALIGNMENT (DR_i) to zero. If the misalignments of DR_i and
2029 dr0 are known at compile time, then increment DR_MISALIGNMENT (DR_i)
2030 by the peeling factor times the element size of DR_i (MOD the
2031 vectorization factor times the size). Otherwise, the
2032 misalignment of DR_i must be set to unknown. */
2033 FOR_EACH_VEC_ELT (datarefs, i, dr)
2034 if (dr != dr0_info->dr)
2036 /* Strided accesses perform only component accesses, alignment
2037 is irrelevant for them. */
2038 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2039 stmt_info = dr_info->stmt;
2040 if (STMT_VINFO_STRIDED_P (stmt_info)
2041 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2042 continue;
2044 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2047 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2048 if (npeel)
2049 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2050 else
2051 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2052 = DR_MISALIGNMENT (dr0_info);
2053 SET_DR_MISALIGNMENT (dr0_info, 0);
2054 if (dump_enabled_p ())
2056 dump_printf_loc (MSG_NOTE, vect_location,
2057 "Alignment of access forced using peeling.\n");
2058 dump_printf_loc (MSG_NOTE, vect_location,
2059 "Peeling for alignment will be applied.\n");
2062 /* The inside-loop cost will be accounted for in vectorizable_load
2063 and vectorizable_store correctly with adjusted alignments.
2064 Drop the body_cst_vec on the floor here. */
2065 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2066 gcc_assert (stat);
2067 return stat;
2071 /* (2) Versioning to force alignment. */
2073 /* Try versioning if:
2074 1) optimize loop for speed
2075 2) there is at least one unsupported misaligned data ref with an unknown
2076 misalignment, and
2077 3) all misaligned data refs with a known misalignment are supported, and
2078 4) the number of runtime alignment checks is within reason. */
2080 do_versioning =
2081 optimize_loop_nest_for_speed_p (loop)
2082 && (!loop->inner); /* FORNOW */
2084 if (do_versioning)
2086 FOR_EACH_VEC_ELT (datarefs, i, dr)
2088 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2089 stmt_vec_info stmt_info = dr_info->stmt;
2091 /* For interleaving, only the alignment of the first access
2092 matters. */
2093 if (aligned_access_p (dr_info)
2094 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2095 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2096 continue;
2098 if (STMT_VINFO_STRIDED_P (stmt_info))
2100 /* Strided loads perform only component accesses, alignment is
2101 irrelevant for them. */
2102 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2103 continue;
2104 do_versioning = false;
2105 break;
2108 supportable_dr_alignment
2109 = vect_supportable_dr_alignment (dr_info, false);
2111 if (!supportable_dr_alignment)
2113 int mask;
2114 tree vectype;
2116 if (known_alignment_for_access_p (dr_info)
2117 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2118 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2120 do_versioning = false;
2121 break;
2124 vectype = STMT_VINFO_VECTYPE (stmt_info);
2125 gcc_assert (vectype);
2127 /* At present we don't support versioning for alignment
2128 with variable VF, since there's no guarantee that the
2129 VF is a power of two. We could relax this if we added
2130 a way of enforcing a power-of-two size. */
2131 unsigned HOST_WIDE_INT size;
2132 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2134 do_versioning = false;
2135 break;
2138 /* The rightmost bits of an aligned address must be zeros.
2139 Construct the mask needed for this test. For example,
2140 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2141 mask must be 15 = 0xf. */
2142 mask = size - 1;
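/* The versioning condition emitted later then tests, roughly,
   ((addr_1 | addr_2 | ...) & mask) == 0 for the potentially misaligned
   addresses collected below, which is why all of them must share this
   mask.  */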
2144 /* FORNOW: use the same mask to test all potentially unaligned
2145 references in the loop. The vectorizer currently supports
2146 a single vector size, see the reference to
2147 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2148 vectorization factor is computed. */
2149 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2150 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2151 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2152 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2156 /* Versioning requires at least one misaligned data reference. */
2157 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2158 do_versioning = false;
2159 else if (!do_versioning)
2160 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2163 if (do_versioning)
2165 vec<stmt_vec_info> may_misalign_stmts
2166 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2167 stmt_vec_info stmt_info;
2169 /* It can now be assumed that the data references in the statements
2170 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2171 of the loop being vectorized. */
2172 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2174 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2175 SET_DR_MISALIGNMENT (dr_info, 0);
2176 if (dump_enabled_p ())
2177 dump_printf_loc (MSG_NOTE, vect_location,
2178 "Alignment of access forced using versioning.\n");
2181 if (dump_enabled_p ())
2182 dump_printf_loc (MSG_NOTE, vect_location,
2183 "Versioning for alignment will be applied.\n");
2185 /* Peeling and versioning can't be done together at this time. */
2186 gcc_assert (! (do_peeling && do_versioning));
2188 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2189 gcc_assert (stat);
2190 return stat;
2193 /* This point is reached if neither peeling nor versioning is being done. */
2194 gcc_assert (! (do_peeling || do_versioning));
2196 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2197 return stat;
2201 /* Function vect_find_same_alignment_drs.
2203 Update group and alignment relations in VINFO according to the chosen
2204 vectorization factor. */
2206 static void
2207 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2209 struct data_reference *dra = DDR_A (ddr);
2210 struct data_reference *drb = DDR_B (ddr);
2211 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2212 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2213 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2214 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2216 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2217 return;
2219 if (dra == drb)
2220 return;
2222 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2223 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2224 return;
2226 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2227 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2228 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2229 return;
2231 /* Two references with distance zero have the same alignment. */
2232 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2233 - wi::to_poly_offset (DR_INIT (drb)));
2234 if (maybe_ne (diff, 0))
2236 /* Get the wider of the two alignments. */
2237 unsigned int align_a = (vect_calculate_target_alignment (dr_info_a)
2238 / BITS_PER_UNIT);
2239 unsigned int align_b = (vect_calculate_target_alignment (dr_info_b)
2240 / BITS_PER_UNIT);
2241 unsigned int max_align = MAX (align_a, align_b);
2243 /* Require the gap to be a multiple of the larger vector alignment. */
2244 if (!multiple_p (diff, max_align))
2245 return;
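/* E.g. (assumed numbers): two 4-byte accesses whose DR_INITs differ by 32
   bytes share alignment when the larger target alignment is 16, since 32 is
   a multiple of 16; a 20-byte gap would not qualify and we would return
   above.  */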
2248 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2249 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2250 if (dump_enabled_p ())
2251 dump_printf_loc (MSG_NOTE, vect_location,
2252 "accesses have the same alignment: %T and %T\n",
2253 DR_REF (dra), DR_REF (drb));
2257 /* Function vect_analyze_data_refs_alignment
2259 Analyze the alignment of the data-references in the loop.
2260 Return FALSE if a data reference is found that cannot be vectorized. */
2262 opt_result
2263 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2265 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2267 /* Mark groups of data references with same alignment using
2268 data dependence information. */
2269 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2270 struct data_dependence_relation *ddr;
2271 unsigned int i;
2273 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2274 vect_find_same_alignment_drs (vinfo, ddr);
2276 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2277 struct data_reference *dr;
2279 vect_record_base_alignments (vinfo);
2280 FOR_EACH_VEC_ELT (datarefs, i, dr)
2282 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2283 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2284 vect_compute_data_ref_alignment (dr_info);
2287 return opt_result::success ();
2291 /* Analyze alignment of DRs of stmts in NODE. */
2293 static bool
2294 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2296 /* We vectorize from the first scalar stmt in the node unless
2297 the node is permuted in which case we start from the first
2298 element in the group. */
2299 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2300 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2301 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2302 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2304 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2305 vect_compute_data_ref_alignment (dr_info);
2306 /* For creating the data-ref pointer we need alignment of the
2307 first element anyway. */
2308 if (dr_info != first_dr_info)
2309 vect_compute_data_ref_alignment (first_dr_info);
2310 if (! verify_data_ref_alignment (dr_info))
2312 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2314 "not vectorized: bad data alignment in basic "
2315 "block.\n");
2316 return false;
2319 return true;
2322 /* Function vect_slp_analyze_and_verify_instance_alignment
2324 Analyze the alignment of the data-references in the SLP instance.
2325 Return FALSE if a data reference is found that cannot be vectorized. */
2327 bool
2328 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2330 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2332 slp_tree node;
2333 unsigned i;
2334 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2335 if (! vect_slp_analyze_and_verify_node_alignment (node))
2336 return false;
2338 node = SLP_INSTANCE_TREE (instance);
2339 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2340 && ! vect_slp_analyze_and_verify_node_alignment
2341 (SLP_INSTANCE_TREE (instance)))
2342 return false;
2344 return true;
2348 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2349 accesses of legal size, step, etc. Detect gaps, single element
2350 interleaving, and other special cases. Set grouped access info.
2351 Collect groups of strided stores for further use in SLP analysis.
2352 Worker for vect_analyze_group_access. */
2354 static bool
2355 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2357 data_reference *dr = dr_info->dr;
2358 tree step = DR_STEP (dr);
2359 tree scalar_type = TREE_TYPE (DR_REF (dr));
2360 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2361 stmt_vec_info stmt_info = dr_info->stmt;
2362 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2363 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2364 HOST_WIDE_INT dr_step = -1;
2365 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2366 bool slp_impossible = false;
2368 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2369 size of the interleaving group (including gaps). */
2370 if (tree_fits_shwi_p (step))
2372 dr_step = tree_to_shwi (step);
2373 /* Check that STEP is a multiple of type size. Otherwise there is
2374 a non-element-sized gap at the end of the group which we
2375 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2376 ??? As we can handle non-constant step fine here we should
2377 simply remove uses of DR_GROUP_GAP between the last and first
2378 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2379 simply not include that gap. */
2380 if ((dr_step % type_size) != 0)
2382 if (dump_enabled_p ())
2383 dump_printf_loc (MSG_NOTE, vect_location,
2384 "Step %T is not a multiple of the element size"
2385 " for %T\n",
2386 step, DR_REF (dr));
2387 return false;
2389 groupsize = absu_hwi (dr_step) / type_size;
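/* E.g. (illustrative): a constant step of 32 bytes over 4-byte elements
   gives GROUPSIZE == 8, i.e. each scalar iteration moves over eight element
   slots, including any gaps.  */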
2391 else
2392 groupsize = 0;
2394 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2395 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2397 /* Check whether this DR is part of an interleaving group, and is a single
2398 element of the group that is accessed in the loop.
2400 /* Gaps are supported only for loads. STEP must be a multiple of the type
2401 size. */
2402 if (DR_IS_READ (dr)
2403 && (dr_step % type_size) == 0
2404 && groupsize > 0)
2406 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2407 DR_GROUP_SIZE (stmt_info) = groupsize;
2408 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2409 if (dump_enabled_p ())
2410 dump_printf_loc (MSG_NOTE, vect_location,
2411 "Detected single element interleaving %T"
2412 " step %T\n",
2413 DR_REF (dr), step);
2415 return true;
2418 if (dump_enabled_p ())
2419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2420 "not consecutive access %G", stmt_info->stmt);
2422 if (bb_vinfo)
2424 /* Mark the statement as unvectorizable. */
2425 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2426 return true;
2429 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2430 STMT_VINFO_STRIDED_P (stmt_info) = true;
2431 return true;
2434 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2436 /* First stmt in the interleaving chain. Check the chain. */
2437 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2438 struct data_reference *data_ref = dr;
2439 unsigned int count = 1;
2440 tree prev_init = DR_INIT (data_ref);
2441 stmt_vec_info prev = stmt_info;
2442 HOST_WIDE_INT diff, gaps = 0;
2444 /* By construction, all group members have INTEGER_CST DR_INITs. */
2445 while (next)
2447 /* Skip same data-refs. In case two or more stmts share a
2448 data-ref (supported only for loads), we vectorize only the first
2449 stmt, and the rest get their vectorized loads from the first
2450 one. */
2451 if (!tree_int_cst_compare (DR_INIT (data_ref),
2452 DR_INIT (STMT_VINFO_DATA_REF (next))))
2454 if (DR_IS_WRITE (data_ref))
2456 if (dump_enabled_p ())
2457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2458 "Two store stmts share the same dr.\n");
2459 return false;
2462 if (dump_enabled_p ())
2463 dump_printf_loc (MSG_NOTE, vect_location,
2464 "Two or more load stmts share the same dr.\n");
2466 /* For load use the same data-ref load. */
2467 DR_GROUP_SAME_DR_STMT (next) = prev;
2469 prev = next;
2470 next = DR_GROUP_NEXT_ELEMENT (next);
2471 continue;
2474 prev = next;
2475 data_ref = STMT_VINFO_DATA_REF (next);
2477 /* All group members have the same STEP by construction. */
2478 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2480 /* Check that the distance between two accesses is equal to the type
2481 size. Otherwise, we have gaps. */
2482 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2483 - TREE_INT_CST_LOW (prev_init)) / type_size;
2484 if (diff != 1)
2486 /* FORNOW: SLP of accesses with gaps is not supported. */
2487 slp_impossible = true;
2488 if (DR_IS_WRITE (data_ref))
2490 if (dump_enabled_p ())
2491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2492 "interleaved store with gaps\n");
2493 return false;
2496 gaps += diff - 1;
2499 last_accessed_element += diff;
2501 /* Store the gap from the previous member of the group. If there is no
2502 gap in the access, DR_GROUP_GAP is always 1. */
2503 DR_GROUP_GAP (next) = diff;
2505 prev_init = DR_INIT (data_ref);
2506 next = DR_GROUP_NEXT_ELEMENT (next);
2507 /* Count the number of data-refs in the chain. */
2508 count++;
2511 if (groupsize == 0)
2512 groupsize = count + gaps;
2514 /* This could be UINT_MAX but as we are generating code in a very
2515 inefficient way we have to cap earlier. See PR78699 for example. */
2516 if (groupsize > 4096)
2518 if (dump_enabled_p ())
2519 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2520 "group is too large\n");
2521 return false;
2524 /* Check that the size of the interleaving is equal to count for stores,
2525 i.e., that there are no gaps. */
2526 if (groupsize != count
2527 && !DR_IS_READ (dr))
2529 groupsize = count;
2530 STMT_VINFO_STRIDED_P (stmt_info) = true;
2533 /* If there is a gap after the last load in the group it is the
2534 difference between the groupsize and the last accessed
2535 element.
2536 When there is no gap, this difference should be 0. */
2537 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
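/* E.g. (assumed access pattern): loads of a[4*i] and a[4*i + 1] with
   GROUPSIZE == 4 leave LAST_ACCESSED_ELEMENT == 2, so the first statement
   gets DR_GROUP_GAP == 2 for the two unused trailing elements.  */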
2539 DR_GROUP_SIZE (stmt_info) = groupsize;
2540 if (dump_enabled_p ())
2542 dump_printf_loc (MSG_NOTE, vect_location,
2543 "Detected interleaving ");
2544 if (DR_IS_READ (dr))
2545 dump_printf (MSG_NOTE, "load ");
2546 else if (STMT_VINFO_STRIDED_P (stmt_info))
2547 dump_printf (MSG_NOTE, "strided store ");
2548 else
2549 dump_printf (MSG_NOTE, "store ");
2550 dump_printf (MSG_NOTE, "of size %u\n",
2551 (unsigned)groupsize);
2552 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2553 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2554 while (next)
2556 if (DR_GROUP_GAP (next) != 1)
2557 dump_printf_loc (MSG_NOTE, vect_location,
2558 "\t<gap of %d elements>\n",
2559 DR_GROUP_GAP (next) - 1);
2560 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2561 next = DR_GROUP_NEXT_ELEMENT (next);
2563 if (DR_GROUP_GAP (stmt_info) != 0)
2564 dump_printf_loc (MSG_NOTE, vect_location,
2565 "\t<gap of %d elements>\n",
2566 DR_GROUP_GAP (stmt_info));
2569 /* SLP: create an SLP data structure for every interleaving group of
2570 stores for further analysis in vect_analyze_slp. */
2571 if (DR_IS_WRITE (dr) && !slp_impossible)
2573 if (loop_vinfo)
2574 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2575 if (bb_vinfo)
2576 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2580 return true;
2583 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2584 accesses of legal size, step, etc. Detect gaps, single element
2585 interleaving, and other special cases. Set grouped access info.
2586 Collect groups of strided stores for further use in SLP analysis. */
2588 static bool
2589 vect_analyze_group_access (dr_vec_info *dr_info)
2591 if (!vect_analyze_group_access_1 (dr_info))
2593 /* Dissolve the group if present. */
2594 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2595 while (stmt_info)
2597 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2598 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2599 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2600 stmt_info = next;
2602 return false;
2604 return true;
2607 /* Analyze the access pattern of the data-reference DR_INFO.
2608 In case of non-consecutive accesses call vect_analyze_group_access() to
2609 analyze groups of accesses. */
2611 static bool
2612 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2614 data_reference *dr = dr_info->dr;
2615 tree step = DR_STEP (dr);
2616 tree scalar_type = TREE_TYPE (DR_REF (dr));
2617 stmt_vec_info stmt_info = dr_info->stmt;
2618 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2619 struct loop *loop = NULL;
2621 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2622 return true;
2624 if (loop_vinfo)
2625 loop = LOOP_VINFO_LOOP (loop_vinfo);
2627 if (loop_vinfo && !step)
2629 if (dump_enabled_p ())
2630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2631 "bad data-ref access in loop\n");
2632 return false;
2635 /* Allow loads with zero step in inner-loop vectorization. */
2636 if (loop_vinfo && integer_zerop (step))
2638 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2639 if (!nested_in_vect_loop_p (loop, stmt_info))
2640 return DR_IS_READ (dr);
2641 /* Allow references with zero step for outer loops marked
2642 with pragma omp simd only - it guarantees absence of
2643 loop-carried dependencies between inner loop iterations. */
2644 if (loop->safelen < 2)
2646 if (dump_enabled_p ())
2647 dump_printf_loc (MSG_NOTE, vect_location,
2648 "zero step in inner loop of nest\n");
2649 return false;
2653 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2655 /* Interleaved accesses are not yet supported within outer-loop
2656 vectorization for references in the inner-loop. */
2657 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2659 /* For the rest of the analysis we use the outer-loop step. */
2660 step = STMT_VINFO_DR_STEP (stmt_info);
2661 if (integer_zerop (step))
2663 if (dump_enabled_p ())
2664 dump_printf_loc (MSG_NOTE, vect_location,
2665 "zero step in outer loop.\n");
2666 return DR_IS_READ (dr);
2670 /* Consecutive? */
2671 if (TREE_CODE (step) == INTEGER_CST)
2673 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2674 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2675 || (dr_step < 0
2676 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2678 /* Mark that it is not interleaving. */
2679 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2680 return true;
2684 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2686 if (dump_enabled_p ())
2687 dump_printf_loc (MSG_NOTE, vect_location,
2688 "grouped access in outer loop.\n");
2689 return false;
2693 /* Assume this is a DR handled by non-constant strided load case. */
2694 if (TREE_CODE (step) != INTEGER_CST)
2695 return (STMT_VINFO_STRIDED_P (stmt_info)
2696 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2697 || vect_analyze_group_access (dr_info)));
2699 /* Not a consecutive access - check whether it is part of an interleaving group. */
2700 return vect_analyze_group_access (dr_info);
2703 /* Compare two data-references DRA and DRB, ordering them so that
2704 related accesses end up in chunks suitable for grouping. */
2706 static int
2707 dr_group_sort_cmp (const void *dra_, const void *drb_)
2709 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2710 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2711 int cmp;
2713 /* Stabilize sort. */
2714 if (dra == drb)
2715 return 0;
2717 /* DRs in different loops never belong to the same group. */
2718 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2719 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2720 if (loopa != loopb)
2721 return loopa->num < loopb->num ? -1 : 1;
2723 /* Ordering of DRs according to base. */
2724 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2725 DR_BASE_ADDRESS (drb));
2726 if (cmp != 0)
2727 return cmp;
2729 /* And according to DR_OFFSET. */
2730 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2731 if (cmp != 0)
2732 return cmp;
2734 /* Put reads before writes. */
2735 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2736 return DR_IS_READ (dra) ? -1 : 1;
2738 /* Then sort by access size. */
2739 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2740 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2741 if (cmp != 0)
2742 return cmp;
2744 /* And by step. */
2745 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2746 if (cmp != 0)
2747 return cmp;
2749 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2750 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2751 if (cmp == 0)
2752 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2753 return cmp;
2756 /* If OP is the result of a conversion, return the unconverted value,
2757 otherwise return null. */
2759 static tree
2760 strip_conversion (tree op)
2762 if (TREE_CODE (op) != SSA_NAME)
2763 return NULL_TREE;
2764 gimple *stmt = SSA_NAME_DEF_STMT (op);
2765 if (!is_gimple_assign (stmt)
2766 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2767 return NULL_TREE;
2768 return gimple_assign_rhs1 (stmt);
2771 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2772 and STMT2_INFO being in a single group. */
2774 static bool
2775 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
2777 if (gimple_assign_single_p (stmt1_info->stmt))
2778 return gimple_assign_single_p (stmt2_info->stmt);
2780 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2781 if (call1 && gimple_call_internal_p (call1))
2783 /* Check for two masked loads or two masked stores. */
2784 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2785 if (!call2 || !gimple_call_internal_p (call2))
2786 return false;
2787 internal_fn ifn = gimple_call_internal_fn (call1);
2788 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2789 return false;
2790 if (ifn != gimple_call_internal_fn (call2))
2791 return false;
2793 /* Check that the masks are the same. Cope with casts of masks,
2794 like those created by build_mask_conversion. */
2795 tree mask1 = gimple_call_arg (call1, 2);
2796 tree mask2 = gimple_call_arg (call2, 2);
2797 if (!operand_equal_p (mask1, mask2, 0))
2799 mask1 = strip_conversion (mask1);
2800 if (!mask1)
2801 return false;
2802 mask2 = strip_conversion (mask2);
2803 if (!mask2)
2804 return false;
2805 if (!operand_equal_p (mask1, mask2, 0))
2806 return false;
2808 return true;
2811 return false;
2814 /* Function vect_analyze_data_ref_accesses.
2816 Analyze the access pattern of all the data references in the loop.
2818 FORNOW: the only access pattern that is considered vectorizable is a
2819 simple step 1 (consecutive) access.
2821 FORNOW: handle only arrays and pointer accesses. */
2823 opt_result
2824 vect_analyze_data_ref_accesses (vec_info *vinfo)
2826 unsigned int i;
2827 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2828 struct data_reference *dr;
2830 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2832 if (datarefs.is_empty ())
2833 return opt_result::success ();
2835 /* Sort the array of datarefs to make building the interleaving chains
2836 linear. Don't modify the original vector's order; it is needed for
2837 determining which dependencies are reversed. */
2838 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2839 datarefs_copy.qsort (dr_group_sort_cmp);
2840 hash_set<stmt_vec_info> to_fixup;
2842 /* Build the interleaving chains. */
2843 for (i = 0; i < datarefs_copy.length () - 1;)
2845 data_reference_p dra = datarefs_copy[i];
2846 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2847 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2848 stmt_vec_info lastinfo = NULL;
2849 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2850 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2852 ++i;
2853 continue;
2855 for (i = i + 1; i < datarefs_copy.length (); ++i)
2857 data_reference_p drb = datarefs_copy[i];
2858 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2859 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2860 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2861 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2862 break;
2864 /* ??? Imperfect sorting (non-compatible types, non-modulo
2865 accesses, same accesses) can lead to a group being artificially
2866 split here, as we don't just skip over those. If it really
2867 matters we can push those to a worklist and re-iterate
2868 over them. Then we can just skip ahead to the next DR here. */
2870 /* DRs in a different loop should not be put into the same
2871 interleaving group. */
2872 if (gimple_bb (DR_STMT (dra))->loop_father
2873 != gimple_bb (DR_STMT (drb))->loop_father)
2874 break;
2876 /* Check that the data-refs have the same first location (except init)
2877 and they are both either store or load (not load and store,
2878 not masked loads or stores). */
2879 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2880 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2881 DR_BASE_ADDRESS (drb)) != 0
2882 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2883 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
2884 break;
2886 /* Check that the data-refs have the same constant size. */
2887 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2888 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2889 if (!tree_fits_uhwi_p (sza)
2890 || !tree_fits_uhwi_p (szb)
2891 || !tree_int_cst_equal (sza, szb))
2892 break;
2894 /* Check that the data-refs have the same step. */
2895 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2896 break;
2898 /* Check the types are compatible.
2899 ??? We don't distinguish this during sorting. */
2900 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2901 TREE_TYPE (DR_REF (drb))))
2902 break;
2904 /* Check that the DR_INITs are compile-time constants. */
2905 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2906 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2907 break;
2909 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2910 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2911 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2912 HOST_WIDE_INT init_prev
2913 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2914 gcc_assert (init_a <= init_b
2915 && init_a <= init_prev
2916 && init_prev <= init_b);
2918 /* Do not place the same access in the interleaving chain twice. */
2919 if (init_b == init_prev)
2921 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2922 < gimple_uid (DR_STMT (drb)));
2923 /* Simply link in duplicates and fix up the chain below. */
2925 else
2927 /* If init_b == init_a + the size of the type * k, we have an
2928 interleaving, and DRA is accessed before DRB. */
2929 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2930 if (type_size_a == 0
2931 || (init_b - init_a) % type_size_a != 0)
2932 break;
2934 /* If we have a store, the accesses must be adjacent. This splits
2935 groups into chunks we support (we don't support vectorization
2936 of stores with gaps). */
2937 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2938 break;
2940 /* If the step (when constant and nonzero) is greater than the
2941 difference between the data-refs' inits, this splits groups into
2942 suitable sizes. */
2943 if (tree_fits_shwi_p (DR_STEP (dra)))
2945 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2946 if (step != 0 && step <= (init_b - init_a))
2947 break;
2951 if (dump_enabled_p ())
2952 dump_printf_loc (MSG_NOTE, vect_location,
2953 DR_IS_READ (dra)
2954 ? "Detected interleaving load %T and %T\n"
2955 : "Detected interleaving store %T and %T\n",
2956 DR_REF (dra), DR_REF (drb));
2958 /* Link the found element into the group list. */
2959 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
2961 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
2962 lastinfo = stmtinfo_a;
2964 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
2965 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
2966 lastinfo = stmtinfo_b;
2968 if (init_b == init_prev
2969 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
2970 && dump_enabled_p ())
2971 dump_printf_loc (MSG_NOTE, vect_location,
2972 "Queuing group with duplicate access for fixup\n");
2976 /* Fix up groups with duplicate entries by splitting them. */
2977 while (1)
2979 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
2980 if (!(it != to_fixup.end ()))
2981 break;
2982 stmt_vec_info grp = *it;
2983 to_fixup.remove (grp);
2985 /* Find the earliest duplicate group member. */
2986 unsigned first_duplicate = -1u;
2987 stmt_vec_info next, g = grp;
2988 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
2990 if ((DR_INIT (STMT_VINFO_DR_INFO (next)->dr)
2991 == DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
2992 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
2993 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
2994 g = next;
2996 if (first_duplicate == -1U)
2997 continue;
2999 /* Then move all stmts after the first duplicate to a new group.
3000 Note this is a heuristic but one with the property that *it
3001 is fixed up completely. */
3002 g = grp;
3003 stmt_vec_info newgroup = NULL, ng = grp;
3004 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3006 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3008 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3009 if (!newgroup)
3010 newgroup = next;
3011 else
3012 DR_GROUP_NEXT_ELEMENT (ng) = next;
3013 ng = next;
3014 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3016 else
3017 g = DR_GROUP_NEXT_ELEMENT (g);
3019 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3021 /* Fix up the new group, which may still contain duplicates. */
3022 to_fixup.add (newgroup);
3025 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3027 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3028 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3029 && !vect_analyze_data_ref_access (dr_info))
3031 if (dump_enabled_p ())
3032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3033 "not vectorized: complicated access pattern.\n");
3035 if (is_a <bb_vec_info> (vinfo))
3037 /* Mark the statement as not vectorizable. */
3038 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3039 continue;
3041 else
3043 datarefs_copy.release ();
3044 return opt_result::failure_at (dr_info->stmt->stmt,
3045 "not vectorized:"
3046 " complicated access pattern.\n");
3051 datarefs_copy.release ();
3052 return opt_result::success ();
3055 /* Function vect_vfa_segment_size.
3057 Input:
3058 DR_INFO: The data reference.
3059 LENGTH_FACTOR: segment length to consider.
3061 Return a value suitable for the dr_with_seg_len::seg_len field.
3062 This is the "distance travelled" by the pointer from the first
3063 iteration in the segment to the last. Note that it does not include
3064 the size of the access; in effect it only describes the first byte. */
3066 static tree
3067 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3069 length_factor = size_binop (MINUS_EXPR,
3070 fold_convert (sizetype, length_factor),
3071 size_one_node);
3072 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3073 length_factor);
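/* Worked example (illustrative values): for DR_STEP == 4 bytes and
   LENGTH_FACTOR == 8 (say, the vectorization factor), the segment length is
   4 * (8 - 1) == 28 bytes - the distance from the first byte accessed in the
   first iteration of the segment to the first byte accessed in the last.  */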
3076 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3077 gives the worst-case number of bytes covered by the segment. */
3079 static unsigned HOST_WIDE_INT
3080 vect_vfa_access_size (dr_vec_info *dr_info)
3082 stmt_vec_info stmt_vinfo = dr_info->stmt;
3083 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3084 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3085 unsigned HOST_WIDE_INT access_size = ref_size;
3086 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3088 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3089 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3091 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3092 && (vect_supportable_dr_alignment (dr_info, false)
3093 == dr_explicit_realign_optimized))
3095 /* We might access a full vector's worth. */
3096 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3097 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3099 return access_size;
3102 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3103 describes. */
3105 static unsigned int
3106 vect_vfa_align (dr_vec_info *dr_info)
3108 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3111 /* Function vect_no_alias_p.
3113 Given data references A and B with equal base and offset, see whether
3114 the alias relation can be decided at compilation time. Return 1 if
3115 it can and the references alias, 0 if it can and the references do
3116 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3117 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3118 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3120 static int
3121 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3122 tree segment_length_a, tree segment_length_b,
3123 unsigned HOST_WIDE_INT access_size_a,
3124 unsigned HOST_WIDE_INT access_size_b)
3126 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3127 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3128 poly_uint64 const_length_a;
3129 poly_uint64 const_length_b;
3131 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3132 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3133 [a, a+12) */
3134 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3136 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3137 offset_a = (offset_a + access_size_a) - const_length_a;
3139 else
3140 const_length_a = tree_to_poly_uint64 (segment_length_a);
3141 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3143 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3144 offset_b = (offset_b + access_size_b) - const_length_b;
3146 else
3147 const_length_b = tree_to_poly_uint64 (segment_length_b);
3149 const_length_a += access_size_a;
3150 const_length_b += access_size_b;
3152 if (ranges_known_overlap_p (offset_a, const_length_a,
3153 offset_b, const_length_b))
3154 return 1;
3156 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3157 offset_b, const_length_b))
3158 return 0;
3160 return -1;
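/* Illustrative use of the ranges above (assumed values): with positive
   steps, DR_INITs of 0 and 32, segment lengths of 28 and access sizes of 4,
   the covered ranges are [0, 32) and [32, 64); they cannot overlap, so the
   function returns 0 (no alias).  */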
3163 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3164 in DDR is >= VF. */
3166 static bool
3167 dependence_distance_ge_vf (data_dependence_relation *ddr,
3168 unsigned int loop_depth, poly_uint64 vf)
3170 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3171 || DDR_NUM_DIST_VECTS (ddr) == 0)
3172 return false;
3174 /* If the dependence is exact, we should have limited the VF instead. */
3175 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3177 unsigned int i;
3178 lambda_vector dist_v;
3179 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3181 HOST_WIDE_INT dist = dist_v[loop_depth];
3182 if (dist != 0
3183 && !(dist > 0 && DDR_REVERSED_P (ddr))
3184 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3185 return false;
3188 if (dump_enabled_p ())
3189 dump_printf_loc (MSG_NOTE, vect_location,
3190 "dependence distance between %T and %T is >= VF\n",
3191 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3193 return true;
3196 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3198 static void
3199 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3201 dump_printf (dump_kind, "%s (%T) >= ",
3202 lower_bound.unsigned_p ? "unsigned" : "abs",
3203 lower_bound.expr);
3204 dump_dec (dump_kind, lower_bound.min_value);
3207 /* Record that the vectorized loop requires the vec_lower_bound described
3208 by EXPR, UNSIGNED_P and MIN_VALUE. */
3210 static void
3211 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3212 poly_uint64 min_value)
3214 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3215 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3216 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3218 unsigned_p &= lower_bounds[i].unsigned_p;
3219 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3220 if (lower_bounds[i].unsigned_p != unsigned_p
3221 || maybe_lt (lower_bounds[i].min_value, min_value))
3223 lower_bounds[i].unsigned_p = unsigned_p;
3224 lower_bounds[i].min_value = min_value;
3225 if (dump_enabled_p ())
3227 dump_printf_loc (MSG_NOTE, vect_location,
3228 "updating run-time check to ");
3229 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3230 dump_printf (MSG_NOTE, "\n");
3233 return;
3236 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3237 if (dump_enabled_p ())
3239 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3240 dump_lower_bound (MSG_NOTE, lower_bound);
3241 dump_printf (MSG_NOTE, "\n");
3243 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3246 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3247 will span fewer than GAP bytes. */
3249 static bool
3250 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3251 poly_int64 gap)
3253 stmt_vec_info stmt_info = dr_info->stmt;
3254 HOST_WIDE_INT count
3255 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3256 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3257 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3258 return (estimated_poly_value (gap)
3259 <= count * vect_get_scalar_dr_size (dr_info));
3262 /* Return true if we know that there is no alias between DR_INFO_A and
3263 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3264 When returning true, set *LOWER_BOUND_OUT to this N. */
3266 static bool
3267 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3268 poly_uint64 *lower_bound_out)
3270 /* Check that there is a constant gap of known sign between DR_A
3271 and DR_B. */
3272 data_reference *dr_a = dr_info_a->dr;
3273 data_reference *dr_b = dr_info_b->dr;
3274 poly_int64 init_a, init_b;
3275 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3276 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3277 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3278 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3279 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3280 || !ordered_p (init_a, init_b))
3281 return false;
3283 /* Sort DR_A and DR_B by the address they access. */
3284 if (maybe_lt (init_b, init_a))
3286 std::swap (init_a, init_b);
3287 std::swap (dr_info_a, dr_info_b);
3288 std::swap (dr_a, dr_b);
3291 /* If the two accesses could be dependent within a scalar iteration,
3292 make sure that we'd retain their order. */
3293 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3294 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3295 return false;
3297 /* There is no alias if abs (DR_STEP) is greater than or equal to
3298 the bytes spanned by the combination of the two accesses. */
3299 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3300 return true;
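/* E.g. (assumed values): two 4-byte accesses at DR_INITs 0 and 8 from the
   same base give *LOWER_BOUND_OUT == 8 + 4 - 0 == 12, so the accesses cannot
   alias whenever abs (DR_STEP) >= 12 bytes.  */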
3303 /* Function vect_prune_runtime_alias_test_list.
3305 Prune a list of ddrs to be tested at run-time by versioning for alias.
3306 Merge several alias checks into one if possible.
3307 Return FALSE if the resulting list of ddrs is longer than allowed by
3308 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3310 opt_result
3311 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3313 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3314 hash_set <tree_pair_hash> compared_objects;
3316 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3317 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3318 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3319 vec<vec_object_pair> &check_unequal_addrs
3320 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3321 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3322 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3324 ddr_p ddr;
3325 unsigned int i;
3326 tree length_factor;
3328 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3330 /* Step values are irrelevant for aliasing if the number of vector
3331 iterations is equal to the number of scalar iterations (which can
3332 happen for fully-SLP loops). */
3333 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3335 if (!ignore_step_p)
3337 /* Convert the checks for nonzero steps into bound tests. */
3338 tree value;
3339 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3340 vect_check_lower_bound (loop_vinfo, value, true, 1);
3343 if (may_alias_ddrs.is_empty ())
3344 return opt_result::success ();
3346 comp_alias_ddrs.create (may_alias_ddrs.length ());
3348 unsigned int loop_depth
3349 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3350 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3352 /* First, we collect all data ref pairs for aliasing checks. */
3353 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3355 int comp_res;
3356 poly_uint64 lower_bound;
3357 tree segment_length_a, segment_length_b;
3358 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3359 unsigned int align_a, align_b;
3361 /* Ignore the alias if the VF we chose ended up being no greater
3362 than the dependence distance. */
3363 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3364 continue;
3366 if (DDR_OBJECT_A (ddr))
3368 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3369 if (!compared_objects.add (new_pair))
3371 if (dump_enabled_p ())
3372 dump_printf_loc (MSG_NOTE, vect_location,
3373 "checking that %T and %T"
3374 " have different addresses\n",
3375 new_pair.first, new_pair.second);
3376 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3378 continue;
3381 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3382 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3384 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3385 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3387 /* Skip the pair if inter-iteration dependencies are irrelevant
3388 and intra-iteration dependencies are guaranteed to be honored. */
3389 if (ignore_step_p
3390 && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3391 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3392 &lower_bound)))
3394 if (dump_enabled_p ())
3395 dump_printf_loc (MSG_NOTE, vect_location,
3396 "no need for alias check between "
3397 "%T and %T when VF is 1\n",
3398 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3399 continue;
3402 /* See whether we can handle the alias using a bounds check on
3403 the step, and whether that's likely to be the best approach.
3404 (It might not be, for example, if the minimum step is much larger
3405 than the number of bytes handled by one vector iteration.) */
3406 if (!ignore_step_p
3407 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3408 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3409 &lower_bound)
3410 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3411 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3413 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3414 if (dump_enabled_p ())
3416 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3417 "%T and %T when the step %T is outside ",
3418 DR_REF (dr_info_a->dr),
3419 DR_REF (dr_info_b->dr),
3420 DR_STEP (dr_info_a->dr));
3421 if (unsigned_p)
3422 dump_printf (MSG_NOTE, "[0");
3423 else
3425 dump_printf (MSG_NOTE, "(");
3426 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3428 dump_printf (MSG_NOTE, ", ");
3429 dump_dec (MSG_NOTE, lower_bound);
3430 dump_printf (MSG_NOTE, ")\n");
3432 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3433 unsigned_p, lower_bound);
3434 continue;
3437 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3438 if (dr_group_first_a)
3440 stmt_info_a = dr_group_first_a;
3441 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3444 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3445 if (dr_group_first_b)
3447 stmt_info_b = dr_group_first_b;
3448 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3451 if (ignore_step_p)
3453 segment_length_a = size_zero_node;
3454 segment_length_b = size_zero_node;
3456 else
3458 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3459 DR_STEP (dr_info_b->dr), 0))
3460 length_factor = scalar_loop_iters;
3461 else
3462 length_factor = size_int (vect_factor);
3463 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3464 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3466 access_size_a = vect_vfa_access_size (dr_info_a);
3467 access_size_b = vect_vfa_access_size (dr_info_b);
3468 align_a = vect_vfa_align (dr_info_a);
3469 align_b = vect_vfa_align (dr_info_b);
3471 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3472 DR_BASE_ADDRESS (dr_info_b->dr));
3473 if (comp_res == 0)
3474 comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3475 DR_OFFSET (dr_info_b->dr));
3477 /* See whether the alias is known at compilation time. */
3478 if (comp_res == 0
3479 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3480 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3481 && poly_int_tree_p (segment_length_a)
3482 && poly_int_tree_p (segment_length_b))
3484 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3485 segment_length_a,
3486 segment_length_b,
3487 access_size_a,
3488 access_size_b);
3489 if (res >= 0 && dump_enabled_p ())
3491 dump_printf_loc (MSG_NOTE, vect_location,
3492 "can tell at compile time that %T and %T",
3493 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3494 if (res == 0)
3495 dump_printf (MSG_NOTE, " do not alias\n");
3496 else
3497 dump_printf (MSG_NOTE, " alias\n");
3500 if (res == 0)
3501 continue;
3503 if (res == 1)
3504 return opt_result::failure_at (stmt_info_b->stmt,
3505 "not vectorized:"
3506 " compilation time alias: %G%G",
3507 stmt_info_a->stmt,
3508 stmt_info_b->stmt);
3511 dr_with_seg_len_pair_t dr_with_seg_len_pair
3512 (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3513 access_size_a, align_a),
3514 dr_with_seg_len (dr_info_b->dr, segment_length_b,
3515 access_size_b, align_b));
3517 /* Canonicalize pairs by sorting the two DR members. */
3518 if (comp_res > 0)
3519 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3521 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3524 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3526 unsigned int count = (comp_alias_ddrs.length ()
3527 + check_unequal_addrs.length ());
3529 dump_printf_loc (MSG_NOTE, vect_location,
3530 "improved number of alias checks from %d to %d\n",
3531 may_alias_ddrs.length (), count);
3532 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3533 return opt_result::failure_at
3534 (vect_location,
3535 "number of versioning for alias "
3536 "run-time tests exceeds %d "
3537 "(--param vect-max-version-for-alias-checks)\n",
3538 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3540 return opt_result::success ();
3543 /* Check whether we can use an internal function for a gather load
3544 or scatter store. READ_P is true for loads and false for stores.
3545 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3546 the type of the memory elements being loaded or stored. OFFSET_BITS
3547 is the number of bits in each scalar offset and OFFSET_SIGN is the
3548 sign of the offset. SCALE is the amount by which the offset should
3549 be multiplied *after* it has been converted to address width.
3551 Return true if the function is supported, storing the function
3552 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
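/* Illustrative sketch (hypothetical caller, not taken from real code):
   a masked gather of 32-bit integers indexed by a signed 32-bit offset
   scaled by 4, with VECTYPE a vector of 32-bit elements, would be
   checked roughly as

     internal_fn ifn;
     tree elt_type;
     if (vect_gather_scatter_fn_p (true, true, vectype, int_type_node,
				   32, SIGNED, 4, &ifn, &elt_type))
       ... use the IFN_MASK_GATHER_LOAD returned in ifn ...  */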
3554 bool
3555 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3556 tree memory_type, unsigned int offset_bits,
3557 signop offset_sign, int scale,
3558 internal_fn *ifn_out, tree *element_type_out)
3560 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3561 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3562 if (offset_bits > element_bits)
3563 /* Internal functions require the offset to be the same width as
3564 the vector elements. We can extend narrower offsets, but it isn't
3565 safe to truncate wider offsets. */
3566 return false;
3568 if (element_bits != memory_bits)
3569 /* For now the vector elements must be the same width as the
3570 memory elements. */
3571 return false;
3573 /* Work out which function we need. */
3574 internal_fn ifn;
3575 if (read_p)
3576 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3577 else
3578 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3580 /* Test whether the target supports this combination. */
3581 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3582 offset_sign, scale))
3583 return false;
3585 *ifn_out = ifn;
3586 *element_type_out = TREE_TYPE (vectype);
3587 return true;
3590 /* STMT_INFO is a call to an internal gather load or scatter store function.
3591 Describe the operation in INFO. */
3593 static void
3594 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3595 gather_scatter_info *info)
3597 gcall *call = as_a <gcall *> (stmt_info->stmt);
3598 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3599 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3601 info->ifn = gimple_call_internal_fn (call);
3602 info->decl = NULL_TREE;
3603 info->base = gimple_call_arg (call, 0);
3604 info->offset = gimple_call_arg (call, 1);
3605 info->offset_dt = vect_unknown_def_type;
3606 info->offset_vectype = NULL_TREE;
3607 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3608 info->element_type = TREE_TYPE (vectype);
3609 info->memory_type = TREE_TYPE (DR_REF (dr));
3612 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3613 gather load or scatter store. Describe the operation in *INFO if so. */
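/* For example (illustrative source-level view, names made up), the load in

     for (i = 0; i < n; i++)
       out[i] = in[idx[i]];

   is a gather candidate: its address splits into a loop-invariant base
   (&in) plus a loop-varying, vectorizable offset (idx[i]) times the
   element size.  */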
3615 bool
3616 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3617 gather_scatter_info *info)
3619 HOST_WIDE_INT scale = 1;
3620 poly_int64 pbitpos, pbitsize;
3621 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3622 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3623 tree offtype = NULL_TREE;
3624 tree decl = NULL_TREE, base, off;
3625 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3626 tree memory_type = TREE_TYPE (DR_REF (dr));
3627 machine_mode pmode;
3628 int punsignedp, reversep, pvolatilep = 0;
3629 internal_fn ifn;
3630 tree element_type;
3631 bool masked_p = false;
3633 /* See whether this is already a call to a gather/scatter internal function.
3634 If not, see whether it's a masked load or store. */
3635 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3636 if (call && gimple_call_internal_p (call))
3638 ifn = gimple_call_internal_fn (call);
3639 if (internal_gather_scatter_fn_p (ifn))
3641 vect_describe_gather_scatter_call (stmt_info, info);
3642 return true;
3644 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3647 /* True if we should aim to use internal functions rather than
3648 built-in functions. */
3649 bool use_ifn_p = (DR_IS_READ (dr)
3650 ? supports_vec_gather_load_p ()
3651 : supports_vec_scatter_store_p ());
3653 base = DR_REF (dr);
3654 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3655 see if we can use the def stmt of the address. */
3656 if (masked_p
3657 && TREE_CODE (base) == MEM_REF
3658 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3659 && integer_zerop (TREE_OPERAND (base, 1))
3660 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3662 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3663 if (is_gimple_assign (def_stmt)
3664 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3665 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3668 /* The gather and scatter builtins need address of the form
3669 loop_invariant + vector * {1, 2, 4, 8}
3670 or
3671 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3672 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3673 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3674 multiplications and additions in it. To get a vector, we need
3675 a single SSA_NAME that will be defined in the loop and will
3676 contain everything that is not loop invariant and that can be
3677 vectorized. The following code attempts to find such a preexisting
3678 SSA_NAME OFF and put the loop invariants into a tree BASE
3679 that can be gimplified before the loop. */
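/* Illustrative example (hypothetical access, not generated code): for
   something like a[k[i] + 4] the loop below aims to end up with

     BASE  = &a + 4 * sizeof (a[0])	 loop invariant
     OFF   = the SSA_NAME holding k[i]	 defined in the loop
     scale = sizeof (a[0])

   so that the address is BASE + OFF * scale.  */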
3680 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3681 &punsignedp, &reversep, &pvolatilep);
3682 if (reversep)
3683 return false;
3685 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3687 if (TREE_CODE (base) == MEM_REF)
3689 if (!integer_zerop (TREE_OPERAND (base, 1)))
3691 if (off == NULL_TREE)
3692 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3693 else
3694 off = size_binop (PLUS_EXPR, off,
3695 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3697 base = TREE_OPERAND (base, 0);
3699 else
3700 base = build_fold_addr_expr (base);
3702 if (off == NULL_TREE)
3703 off = size_zero_node;
3705 /* If base is not loop invariant, then if off is 0 we start with just
3706 the constant offset in the loop invariant BASE and continue with base
3707 as OFF; otherwise give up.
3708 We could handle that case by gimplifying the addition of base + off
3709 into some SSA_NAME and using that as off, but for now punt. */
3710 if (!expr_invariant_in_loop_p (loop, base))
3712 if (!integer_zerop (off))
3713 return false;
3714 off = base;
3715 base = size_int (pbytepos);
3717 /* Otherwise put base + constant offset into the loop invariant BASE
3718 and continue with OFF. */
3719 else
3721 base = fold_convert (sizetype, base);
3722 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3725 /* OFF at this point may be either a SSA_NAME or some tree expression
3726 from get_inner_reference. Try to peel off loop invariants from it
3727 into BASE as long as possible. */
3728 STRIP_NOPS (off);
3729 while (offtype == NULL_TREE)
3731 enum tree_code code;
3732 tree op0, op1, add = NULL_TREE;
3734 if (TREE_CODE (off) == SSA_NAME)
3736 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3738 if (expr_invariant_in_loop_p (loop, off))
3739 return false;
3741 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3742 break;
3744 op0 = gimple_assign_rhs1 (def_stmt);
3745 code = gimple_assign_rhs_code (def_stmt);
3746 op1 = gimple_assign_rhs2 (def_stmt);
3748 else
3750 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3751 return false;
3752 code = TREE_CODE (off);
3753 extract_ops_from_tree (off, &code, &op0, &op1);
3755 switch (code)
3757 case POINTER_PLUS_EXPR:
3758 case PLUS_EXPR:
3759 if (expr_invariant_in_loop_p (loop, op0))
3761 add = op0;
3762 off = op1;
3763 do_add:
3764 add = fold_convert (sizetype, add);
3765 if (scale != 1)
3766 add = size_binop (MULT_EXPR, add, size_int (scale));
3767 base = size_binop (PLUS_EXPR, base, add);
3768 continue;
3770 if (expr_invariant_in_loop_p (loop, op1))
3772 add = op1;
3773 off = op0;
3774 goto do_add;
3776 break;
3777 case MINUS_EXPR:
3778 if (expr_invariant_in_loop_p (loop, op1))
3780 add = fold_convert (sizetype, op1);
3781 add = size_binop (MINUS_EXPR, size_zero_node, add);
3782 off = op0;
3783 goto do_add;
3785 break;
3786 case MULT_EXPR:
3787 if (scale == 1 && tree_fits_shwi_p (op1))
3789 int new_scale = tree_to_shwi (op1);
3790 /* Only treat this as a scaling operation if the target
3791 supports it. */
3792 if (use_ifn_p
3793 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3794 vectype, memory_type, 1,
3795 TYPE_SIGN (TREE_TYPE (op0)),
3796 new_scale, &ifn,
3797 &element_type))
3798 break;
3799 scale = new_scale;
3800 off = op0;
3801 continue;
3803 break;
3804 case SSA_NAME:
3805 off = op0;
3806 continue;
3807 CASE_CONVERT:
3808 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3809 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3810 break;
3811 if (TYPE_PRECISION (TREE_TYPE (op0))
3812 == TYPE_PRECISION (TREE_TYPE (off)))
3814 off = op0;
3815 continue;
3818 /* The internal functions need the offset to be the same width
3819 as the elements of VECTYPE. Don't include operations that
3820 cast the offset from that width to a different width. */
3821 if (use_ifn_p
3822 && (int_size_in_bytes (TREE_TYPE (vectype))
3823 == int_size_in_bytes (TREE_TYPE (off))))
3824 break;
3826 if (TYPE_PRECISION (TREE_TYPE (op0))
3827 < TYPE_PRECISION (TREE_TYPE (off)))
3829 off = op0;
3830 offtype = TREE_TYPE (off);
3831 STRIP_NOPS (off);
3832 continue;
3834 break;
3835 default:
3836 break;
3838 break;
3841 /* If at the end OFF still isn't a SSA_NAME or isn't
3842 defined in the loop, punt. */
3843 if (TREE_CODE (off) != SSA_NAME
3844 || expr_invariant_in_loop_p (loop, off))
3845 return false;
3847 if (offtype == NULL_TREE)
3848 offtype = TREE_TYPE (off);
3850 if (use_ifn_p)
3852 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3853 memory_type, TYPE_PRECISION (offtype),
3854 TYPE_SIGN (offtype), scale, &ifn,
3855 &element_type))
3856 return false;
3858 else
3860 if (DR_IS_READ (dr))
3862 if (targetm.vectorize.builtin_gather)
3863 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3865 else
3867 if (targetm.vectorize.builtin_scatter)
3868 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3871 if (!decl)
3872 return false;
3874 ifn = IFN_LAST;
3875 element_type = TREE_TYPE (vectype);
3878 info->ifn = ifn;
3879 info->decl = decl;
3880 info->base = base;
3881 info->offset = off;
3882 info->offset_dt = vect_unknown_def_type;
3883 info->offset_vectype = NULL_TREE;
3884 info->scale = scale;
3885 info->element_type = element_type;
3886 info->memory_type = memory_type;
3887 return true;
3890 /* Find the data references in STMT, analyze them with respect to LOOP and
3891 append them to DATAREFS. Return false if datarefs in this stmt cannot
3892 be handled. */
3894 opt_result
3895 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3896 vec<data_reference_p> *datarefs)
3898 /* We can ignore clobbers for dataref analysis - they are removed during
3899 loop vectorization and BB vectorization checks dependences with a
3900 stmt walk. */
3901 if (gimple_clobber_p (stmt))
3902 return opt_result::success ();
3904 if (gimple_has_volatile_ops (stmt))
3905 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
3906 stmt);
3908 if (stmt_can_throw_internal (cfun, stmt))
3909 return opt_result::failure_at (stmt,
3910 "not vectorized:"
3911 " statement can throw an exception: %G",
3912 stmt);
3914 auto_vec<data_reference_p, 2> refs;
3915 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
3916 if (!res)
3917 return res;
3919 if (refs.is_empty ())
3920 return opt_result::success ();
3922 if (refs.length () > 1)
3923 return opt_result::failure_at (stmt,
3924 "not vectorized:"
3925 " more than one data ref in stmt: %G", stmt);
3927 if (gcall *call = dyn_cast <gcall *> (stmt))
3928 if (!gimple_call_internal_p (call)
3929 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
3930 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
3931 return opt_result::failure_at (stmt,
3932 "not vectorized: dr in a call %G", stmt);
3934 data_reference_p dr = refs.pop ();
3935 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3936 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3937 return opt_result::failure_at (stmt,
3938 "not vectorized:"
3939 " statement is bitfield access %G", stmt);
3941 if (DR_BASE_ADDRESS (dr)
3942 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3943 return opt_result::failure_at (stmt,
3944 "not vectorized:"
3945 " base addr of dr is a constant\n");
3947 /* Check whether this may be a SIMD lane access and adjust the
3948 DR to make it easier for us to handle it. */
3949 if (loop
3950 && loop->simduid
3951 && (!DR_BASE_ADDRESS (dr)
3952 || !DR_OFFSET (dr)
3953 || !DR_INIT (dr)
3954 || !DR_STEP (dr)))
3956 struct data_reference *newdr
3957 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
3958 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
3959 if (DR_BASE_ADDRESS (newdr)
3960 && DR_OFFSET (newdr)
3961 && DR_INIT (newdr)
3962 && DR_STEP (newdr)
3963 && integer_zerop (DR_STEP (newdr)))
3965 tree off = DR_OFFSET (newdr);
3966 STRIP_NOPS (off);
3967 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3968 && TREE_CODE (off) == MULT_EXPR
3969 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3971 tree step = TREE_OPERAND (off, 1);
3972 off = TREE_OPERAND (off, 0);
3973 STRIP_NOPS (off);
3974 if (CONVERT_EXPR_P (off)
3975 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
3976 < TYPE_PRECISION (TREE_TYPE (off))))
3977 off = TREE_OPERAND (off, 0);
3978 if (TREE_CODE (off) == SSA_NAME)
3980 gimple *def = SSA_NAME_DEF_STMT (off);
3981 tree reft = TREE_TYPE (DR_REF (newdr));
3982 if (is_gimple_call (def)
3983 && gimple_call_internal_p (def)
3984 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
3986 tree arg = gimple_call_arg (def, 0);
3987 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3988 arg = SSA_NAME_VAR (arg);
3989 if (arg == loop->simduid
3990 /* For now. */
3991 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
3993 DR_OFFSET (newdr) = ssize_int (0);
3994 DR_STEP (newdr) = step;
3995 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
3996 DR_STEP_ALIGNMENT (newdr)
3997 = highest_pow2_factor (step);
3998 /* Mark as simd-lane access. */
3999 newdr->aux = (void *)-1;
4000 free_data_ref (dr);
4001 datarefs->safe_push (newdr);
4002 return opt_result::success ();
4008 free_data_ref (newdr);
4011 datarefs->safe_push (dr);
4012 return opt_result::success ();
4015 /* Function vect_analyze_data_refs.
4017 Find all the data references in the loop or basic block.
4019 The general structure of the analysis of data refs in the vectorizer is as
4020 follows:
4021 1- vect_analyze_data_refs(loop/bb): call
4022 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4023 in the loop/bb and their dependences.
4024 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4025 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4026 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
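/* For example (illustrative), in

     for (i = 0; i < n; i++)
       a[i] = b[i] + 1;

   step 1 records two data references, the read b[i] and the write a[i],
   each with its base, offset, init and step; the later steps then test
   their dependences, alignment and access pattern.  */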
4030 opt_result
4031 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4033 struct loop *loop = NULL;
4034 unsigned int i;
4035 struct data_reference *dr;
4036 tree scalar_type;
4038 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4040 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4041 loop = LOOP_VINFO_LOOP (loop_vinfo);
4043 /* Go through the data-refs, check that the analysis succeeded. Update
4044 pointer from stmt_vec_info struct to DR and vectype. */
4046 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4047 FOR_EACH_VEC_ELT (datarefs, i, dr)
4049 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4050 poly_uint64 vf;
4052 gcc_assert (DR_REF (dr));
4053 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4054 gcc_assert (!stmt_info->dr_aux.dr);
4055 stmt_info->dr_aux.dr = dr;
4056 stmt_info->dr_aux.stmt = stmt_info;
4058 /* Check that analysis of the data-ref succeeded. */
4059 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4060 || !DR_STEP (dr))
4062 bool maybe_gather
4063 = DR_IS_READ (dr)
4064 && !TREE_THIS_VOLATILE (DR_REF (dr))
4065 && (targetm.vectorize.builtin_gather != NULL
4066 || supports_vec_gather_load_p ());
4067 bool maybe_scatter
4068 = DR_IS_WRITE (dr)
4069 && !TREE_THIS_VOLATILE (DR_REF (dr))
4070 && (targetm.vectorize.builtin_scatter != NULL
4071 || supports_vec_scatter_store_p ());
4073 /* If the target supports vector gather loads or scatter stores,
4074 see if they can be used. */
4075 if (is_a <loop_vec_info> (vinfo)
4076 && !nested_in_vect_loop_p (loop, stmt_info))
4078 if (maybe_gather || maybe_scatter)
4080 if (maybe_gather)
4081 gatherscatter = GATHER;
4082 else
4083 gatherscatter = SCATTER;
4087 if (gatherscatter == SG_NONE)
4089 if (dump_enabled_p ())
4090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4091 "not vectorized: data ref analysis "
4092 "failed %G", stmt_info->stmt);
4093 if (is_a <bb_vec_info> (vinfo))
4095 /* In BB vectorization the ref can still participate
4096 in dependence analysis, we just can't vectorize it. */
4097 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4098 continue;
4100 return opt_result::failure_at (stmt_info->stmt,
4101 "not vectorized:"
4102 " data ref analysis failed: %G",
4103 stmt_info->stmt);
4107 /* See if this was detected as SIMD lane access. */
4108 if (dr->aux == (void *)-1)
4110 if (nested_in_vect_loop_p (loop, stmt_info))
4111 return opt_result::failure_at (stmt_info->stmt,
4112 "not vectorized:"
4113 " data ref analysis failed: %G",
4114 stmt_info->stmt);
4115 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4118 tree base = get_base_address (DR_REF (dr));
4119 if (base && VAR_P (base) && DECL_NONALIASED (base))
4121 if (dump_enabled_p ())
4122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4123 "not vectorized: base object not addressable "
4124 "for stmt: %G", stmt_info->stmt);
4125 if (is_a <bb_vec_info> (vinfo))
4127 /* In BB vectorization the ref can still participate
4128 in dependence analysis, we just can't vectorize it. */
4129 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4130 continue;
4132 return opt_result::failure_at (stmt_info->stmt,
4133 "not vectorized: base object not"
4134 " addressable for stmt: %G",
4135 stmt_info->stmt);
4138 if (is_a <loop_vec_info> (vinfo)
4139 && DR_STEP (dr)
4140 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4142 if (nested_in_vect_loop_p (loop, stmt_info))
4143 return opt_result::failure_at (stmt_info->stmt,
4144 "not vectorized:"
4145 "not suitable for strided load %G",
4146 stmt_info->stmt);
4147 STMT_VINFO_STRIDED_P (stmt_info) = true;
4150 /* Update DR field in stmt_vec_info struct. */
4152 /* If the dataref is in an inner-loop of the loop that is considered
4153 for vectorization, we also want to analyze the access relative to
4154 the outer-loop (DR contains information only relative to the
4155 inner-most enclosing loop). We do that by building a reference to the
4156 first location accessed by the inner-loop, and analyze it relative to
4157 the outer-loop. */
4158 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4160 /* Build a reference to the first location accessed by the
4161 inner loop: *(BASE + INIT + OFFSET). By construction,
4162 this address must be invariant in the inner loop, so we
4163 can consider it as being used in the outer loop. */
4164 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4165 tree offset = unshare_expr (DR_OFFSET (dr));
4166 tree init = unshare_expr (DR_INIT (dr));
4167 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4168 init, offset);
4169 tree init_addr = fold_build_pointer_plus (base, init_offset);
4170 tree init_ref = build_fold_indirect_ref (init_addr);
4172 if (dump_enabled_p ())
4173 dump_printf_loc (MSG_NOTE, vect_location,
4174 "analyze in outer loop: %T\n", init_ref);
4176 opt_result res
4177 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4178 init_ref, loop, stmt_info->stmt);
4179 if (!res)
4180 /* dr_analyze_innermost already explained the failure. */
4181 return res;
4183 if (dump_enabled_p ())
4184 dump_printf_loc (MSG_NOTE, vect_location,
4185 "\touter base_address: %T\n"
4186 "\touter offset from base address: %T\n"
4187 "\touter constant offset from base address: %T\n"
4188 "\touter step: %T\n"
4189 "\touter base alignment: %d\n\n"
4190 "\touter base misalignment: %d\n"
4191 "\touter offset alignment: %d\n"
4192 "\touter step alignment: %d\n",
4193 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4194 STMT_VINFO_DR_OFFSET (stmt_info),
4195 STMT_VINFO_DR_INIT (stmt_info),
4196 STMT_VINFO_DR_STEP (stmt_info),
4197 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4198 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4199 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4200 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4203 /* Set vectype for STMT. */
4204 scalar_type = TREE_TYPE (DR_REF (dr));
4205 STMT_VINFO_VECTYPE (stmt_info)
4206 = get_vectype_for_scalar_type (scalar_type);
4207 if (!STMT_VINFO_VECTYPE (stmt_info))
4209 if (dump_enabled_p ())
4211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4212 "not vectorized: no vectype for stmt: %G",
4213 stmt_info->stmt);
4214 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4215 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4216 scalar_type);
4217 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4220 if (is_a <bb_vec_info> (vinfo))
4222 /* No vector type is fine, the ref can still participate
4223 in dependence analysis, we just can't vectorize it. */
4224 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4225 continue;
4227 return opt_result::failure_at (stmt_info->stmt,
4228 "not vectorized:"
4229 " no vectype for stmt: %G"
4230 " scalar_type: %T\n",
4231 stmt_info->stmt, scalar_type);
4233 else
4235 if (dump_enabled_p ())
4236 dump_printf_loc (MSG_NOTE, vect_location,
4237 "got vectype for stmt: %G%T\n",
4238 stmt_info->stmt, STMT_VINFO_VECTYPE (stmt_info));
4241 /* Adjust the minimal vectorization factor according to the
4242 vector type. */
4243 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4244 *min_vf = upper_bound (*min_vf, vf);
4246 if (gatherscatter != SG_NONE)
4248 gather_scatter_info gs_info;
4249 if (!vect_check_gather_scatter (stmt_info,
4250 as_a <loop_vec_info> (vinfo),
4251 &gs_info)
4252 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4253 return opt_result::failure_at
4254 (stmt_info->stmt,
4255 (gatherscatter == GATHER) ?
4256 "not vectorized: not suitable for gather load %G" :
4257 "not vectorized: not suitable for scatter store %G",
4258 stmt_info->stmt);
4259 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4263 /* We used to stop processing and prune the list here. Verify we no
4264 longer need to. */
4265 gcc_assert (i == datarefs.length ());
4267 return opt_result::success ();
4271 /* Function vect_get_new_vect_var.
4273 Returns a name for a new variable. The current naming scheme prepends
4274 the prefix "vect_" or "vectp_" (depending on the value of VAR_KIND) to
4275 NAME if provided, and otherwise uses just the prefix as the name of the
4276 vectorizer-generated variable. */
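/* Illustrative use (hypothetical arguments): asking for a pointer
   variable with NAME "in",

     tree p = vect_get_new_vect_var (ptr_type, vect_pointer_var, "in");

   creates a temporary register whose name joins the prefix and NAME,
   here "vectp_in"; with a NULL NAME only the bare prefix is used.  */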
4278 tree
4279 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4281 const char *prefix;
4282 tree new_vect_var;
4284 switch (var_kind)
4286 case vect_simple_var:
4287 prefix = "vect";
4288 break;
4289 case vect_scalar_var:
4290 prefix = "stmp";
4291 break;
4292 case vect_mask_var:
4293 prefix = "mask";
4294 break;
4295 case vect_pointer_var:
4296 prefix = "vectp";
4297 break;
4298 default:
4299 gcc_unreachable ();
4302 if (name)
4304 char* tmp = concat (prefix, "_", name, NULL);
4305 new_vect_var = create_tmp_reg (type, tmp);
4306 free (tmp);
4308 else
4309 new_vect_var = create_tmp_reg (type, prefix);
4311 return new_vect_var;
4314 /* Like vect_get_new_vect_var but return an SSA name. */
4316 tree
4317 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4319 const char *prefix;
4320 tree new_vect_var;
4322 switch (var_kind)
4324 case vect_simple_var:
4325 prefix = "vect";
4326 break;
4327 case vect_scalar_var:
4328 prefix = "stmp";
4329 break;
4330 case vect_pointer_var:
4331 prefix = "vectp";
4332 break;
4333 default:
4334 gcc_unreachable ();
4337 if (name)
4339 char* tmp = concat (prefix, "_", name, NULL);
4340 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4341 free (tmp);
4343 else
4344 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4346 return new_vect_var;
4349 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4351 static void
4352 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4354 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4355 int misalign = DR_MISALIGNMENT (dr_info);
4356 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4357 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4358 else
4359 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4360 DR_TARGET_ALIGNMENT (dr_info), misalign);
4363 /* Function vect_create_addr_base_for_vector_ref.
4365 Create an expression that computes the address of the first memory location
4366 that will be accessed for a data reference.
4368 Input:
4369 STMT_INFO: The statement containing the data reference.
4370 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4371 OFFSET: Optional. If supplied, it is added to the initial address.
4372 LOOP: Specify relative to which loop-nest should the address be computed.
4373 For example, when the dataref is in an inner-loop nested in an
4374 outer-loop that is now being vectorized, LOOP can be either the
4375 outer-loop, or the inner-loop. The first memory location accessed
4376 by the following dataref ('in' points to short):
4378 for (i=0; i<N; i++)
4379 for (j=0; j<M; j++)
4380 s += in[i+j]
4382 is as follows:
4383 if LOOP=i_loop: &in (relative to i_loop)
4384 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4385 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4386 initial address. Unlike OFFSET, which is a number of elements to
4387 be added, BYTE_OFFSET is measured in bytes.
4389 Output:
4390 1. Return an SSA_NAME whose value is the address of the memory location of
4391 the first vector of the data reference.
4392 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4393 these statement(s) which define the returned SSA_NAME.
4395 FORNOW: We are only handling array accesses with step 1. */
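/* Illustrative sketch only (the exact statements depend on folding):
   for a simple array access the returned SSA_NAME is defined by a short
   sequence of roughly the form

     _1 = (sizetype) OFFSET * step;		   only if OFFSET is given
     _2 = base_offset + init [+ _1] [+ BYTE_OFFSET];
     vectp_a = &a + _2;

   which the caller emits from NEW_STMT_LIST, e.g. on the loop
   preheader edge.  */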
4397 tree
4398 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4399 gimple_seq *new_stmt_list,
4400 tree offset,
4401 tree byte_offset)
4403 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4404 struct data_reference *dr = dr_info->dr;
4405 const char *base_name;
4406 tree addr_base;
4407 tree dest;
4408 gimple_seq seq = NULL;
4409 tree vect_ptr_type;
4410 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4411 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4412 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4414 tree data_ref_base = unshare_expr (drb->base_address);
4415 tree base_offset = unshare_expr (drb->offset);
4416 tree init = unshare_expr (drb->init);
4418 if (loop_vinfo)
4419 base_name = get_name (data_ref_base);
4420 else
4422 base_offset = ssize_int (0);
4423 init = ssize_int (0);
4424 base_name = get_name (DR_REF (dr));
4427 /* Create base_offset */
4428 base_offset = size_binop (PLUS_EXPR,
4429 fold_convert (sizetype, base_offset),
4430 fold_convert (sizetype, init));
4432 if (offset)
4434 offset = fold_build2 (MULT_EXPR, sizetype,
4435 fold_convert (sizetype, offset), step);
4436 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4437 base_offset, offset);
4439 if (byte_offset)
4441 byte_offset = fold_convert (sizetype, byte_offset);
4442 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4443 base_offset, byte_offset);
4446 /* base + base_offset */
4447 if (loop_vinfo)
4448 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4449 else
4451 addr_base = build1 (ADDR_EXPR,
4452 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4453 unshare_expr (DR_REF (dr)));
4456 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4457 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4458 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4459 gimple_seq_add_seq (new_stmt_list, seq);
4461 if (DR_PTR_INFO (dr)
4462 && TREE_CODE (addr_base) == SSA_NAME
4463 && !SSA_NAME_PTR_INFO (addr_base))
4465 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4466 if (offset || byte_offset)
4467 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4470 if (dump_enabled_p ())
4471 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4473 return addr_base;
4477 /* Function vect_create_data_ref_ptr.
4479 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4480 location accessed in the loop by STMT_INFO, along with the def-use update
4481 chain to appropriately advance the pointer through the loop iterations.
4482 Also set aliasing information for the pointer. This pointer is used by
4483 the callers to this function to create a memory reference expression for
4484 vector load/store access.
4486 Input:
4487 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4488 GIMPLE_ASSIGN <name, data-ref> or
4489 GIMPLE_ASSIGN <data-ref, name>.
4490 2. AGGR_TYPE: the type of the reference, which should be either a vector
4491 or an array.
4492 3. AT_LOOP: the loop where the vector memref is to be created.
4493 4. OFFSET (optional): an offset to be added to the initial address accessed
4494 by the data-ref in STMT_INFO.
4495 5. BSI: location where the new stmts are to be placed if there is no loop
4496 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4497 pointing to the initial address.
4498 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4499 to the initial address accessed by the data-ref in STMT_INFO. This is
4500 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4501 in bytes.
4502 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4503 to the IV during each iteration of the loop. NULL says to move
4504 by one copy of AGGR_TYPE up or down, depending on the step of the
4505 data reference.
4507 Output:
4508 1. Declare a new ptr to vector_type, and have it point to the base of the
4509 data reference (initial address accessed by the data reference).
4510 For example, for vector of type V8HI, the following code is generated:
4512 v8hi *ap;
4513 ap = (v8hi *)initial_address;
4515 if OFFSET is not supplied:
4516 initial_address = &a[init];
4517 if OFFSET is supplied:
4518 initial_address = &a[init + OFFSET];
4519 if BYTE_OFFSET is supplied:
4520 initial_address = &a[init] + BYTE_OFFSET;
4522 Return the initial_address in INITIAL_ADDRESS.
4524 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4525 update the pointer in each iteration of the loop.
4527 Return the increment stmt that updates the pointer in PTR_INCR.
4529 3. Return the pointer. */
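/* Illustrative caller-side sketch (hypothetical, simplified relative to
   what real callers in the vectorizer do):

     tree init_addr;
     gimple *ptr_incr;
     tree dataref_ptr
       = vect_create_data_ref_ptr (stmt_info, vectype, loop, NULL_TREE,
				   &init_addr, gsi, &ptr_incr,
				   false, NULL_TREE, NULL_TREE);

   after which DATAREF_PTR is used to build the vector memory reference
   and PTR_INCR gives the cross-iteration update of the pointer.  */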
4531 tree
4532 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4533 struct loop *at_loop, tree offset,
4534 tree *initial_address, gimple_stmt_iterator *gsi,
4535 gimple **ptr_incr, bool only_init,
4536 tree byte_offset, tree iv_step)
4538 const char *base_name;
4539 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4540 struct loop *loop = NULL;
4541 bool nested_in_vect_loop = false;
4542 struct loop *containing_loop = NULL;
4543 tree aggr_ptr_type;
4544 tree aggr_ptr;
4545 tree new_temp;
4546 gimple_seq new_stmt_list = NULL;
4547 edge pe = NULL;
4548 basic_block new_bb;
4549 tree aggr_ptr_init;
4550 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4551 struct data_reference *dr = dr_info->dr;
4552 tree aptr;
4553 gimple_stmt_iterator incr_gsi;
4554 bool insert_after;
4555 tree indx_before_incr, indx_after_incr;
4556 gimple *incr;
4557 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4559 gcc_assert (iv_step != NULL_TREE
4560 || TREE_CODE (aggr_type) == ARRAY_TYPE
4561 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4563 if (loop_vinfo)
4565 loop = LOOP_VINFO_LOOP (loop_vinfo);
4566 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4567 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4568 pe = loop_preheader_edge (loop);
4570 else
4572 gcc_assert (bb_vinfo);
4573 only_init = true;
4574 *ptr_incr = NULL;
4577 /* Create an expression for the first address accessed by this load
4578 in LOOP. */
4579 base_name = get_name (DR_BASE_ADDRESS (dr));
4581 if (dump_enabled_p ())
4583 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4584 dump_printf_loc (MSG_NOTE, vect_location,
4585 "create %s-pointer variable to type: %T",
4586 get_tree_code_name (TREE_CODE (aggr_type)),
4587 aggr_type);
4588 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4589 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4590 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4591 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4592 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4593 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4594 else
4595 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4596 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4599 /* (1) Create the new aggregate-pointer variable.
4600 Vector and array types inherit the alias set of their component
4601 type by default so we need to use a ref-all pointer if the data
4602 reference does not conflict with the created aggregated data
4603 reference because it is not addressable. */
4604 bool need_ref_all = false;
4605 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4606 get_alias_set (DR_REF (dr))))
4607 need_ref_all = true;
4608 /* Likewise for any of the data references in the stmt group. */
4609 else if (DR_GROUP_SIZE (stmt_info) > 1)
4611 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4614 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4615 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4616 get_alias_set (DR_REF (sdr))))
4618 need_ref_all = true;
4619 break;
4621 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4623 while (sinfo);
4625 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4626 need_ref_all);
4627 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4630 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4631 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4632 def-use update cycles for the pointer: one relative to the outer-loop
4633 (LOOP), which is what steps (3) and (4) below do. The other is relative
4634 to the inner-loop (which is the inner-most loop containing the dataref),
4635 and this is done by step (5) below.
4637 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4638 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4639 redundant. Steps (3),(4) create the following:
4641 vp0 = &base_addr;
4642 LOOP: vp1 = phi(vp0,vp2)
4645 vp2 = vp1 + step
4646 goto LOOP
4648 If there is an inner-loop nested in loop, then step (5) will also be
4649 applied, and an additional update in the inner-loop will be created:
4651 vp0 = &base_addr;
4652 LOOP: vp1 = phi(vp0,vp2)
4654 inner: vp3 = phi(vp1,vp4)
4655 vp4 = vp3 + inner_step
4656 if () goto inner
4658 vp2 = vp1 + step
4659 if () goto LOOP */
4661 /* (2) Calculate the initial address of the aggregate-pointer, and set
4662 the aggregate-pointer to point to it before the loop. */
4664 /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader. */
4666 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4667 offset, byte_offset);
4668 if (new_stmt_list)
4670 if (pe)
4672 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4673 gcc_assert (!new_bb);
4675 else
4676 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4679 *initial_address = new_temp;
4680 aggr_ptr_init = new_temp;
4682 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4683 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4684 inner-loop nested in LOOP (during outer-loop vectorization). */
4686 /* No update in loop is required. */
4687 if (only_init && (!loop_vinfo || at_loop == loop))
4688 aptr = aggr_ptr_init;
4689 else
4691 /* Accesses to invariant addresses should be handled specially
4692 by the caller. */
4693 tree step = vect_dr_behavior (dr_info)->step;
4694 gcc_assert (!integer_zerop (step));
4696 if (iv_step == NULL_TREE)
4698 /* The step of the aggregate pointer is the type size,
4699 negated for downward accesses. */
4700 iv_step = TYPE_SIZE_UNIT (aggr_type);
4701 if (tree_int_cst_sgn (step) == -1)
4702 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4705 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4707 create_iv (aggr_ptr_init,
4708 fold_convert (aggr_ptr_type, iv_step),
4709 aggr_ptr, loop, &incr_gsi, insert_after,
4710 &indx_before_incr, &indx_after_incr);
4711 incr = gsi_stmt (incr_gsi);
4712 loop_vinfo->add_stmt (incr);
4714 /* Copy the points-to information if it exists. */
4715 if (DR_PTR_INFO (dr))
4717 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4718 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4720 if (ptr_incr)
4721 *ptr_incr = incr;
4723 aptr = indx_before_incr;
4726 if (!nested_in_vect_loop || only_init)
4727 return aptr;
4730 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4731 nested in LOOP, if exists. */
4733 gcc_assert (nested_in_vect_loop);
4734 if (!only_init)
4736 standard_iv_increment_position (containing_loop, &incr_gsi,
4737 &insert_after);
4738 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4739 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4740 &indx_after_incr);
4741 incr = gsi_stmt (incr_gsi);
4742 loop_vinfo->add_stmt (incr);
4744 /* Copy the points-to information if it exists. */
4745 if (DR_PTR_INFO (dr))
4747 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4748 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4750 if (ptr_incr)
4751 *ptr_incr = incr;
4753 return indx_before_incr;
4755 else
4756 gcc_unreachable ();
4760 /* Function bump_vector_ptr
4762 Increment a pointer (to a vector type) by vector-size. If requested,
4763 i.e. if PTR-INCR is given, then also connect the new increment stmt
4764 to the existing def-use update-chain of the pointer, by modifying
4765 the PTR_INCR as illustrated below:
4767 The pointer def-use update-chain before this function:
4768 DATAREF_PTR = phi (p_0, p_2)
4769 ....
4770 PTR_INCR: p_2 = DATAREF_PTR + step
4772 The pointer def-use update-chain after this function:
4773 DATAREF_PTR = phi (p_0, p_2)
4774 ....
4775 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4776 ....
4777 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4779 Input:
4780 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4781 in the loop.
4782 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4783 the loop. The increment amount across iterations is expected
4784 to be vector_size.
4785 BSI - location where the new update stmt is to be placed.
4786 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4787 BUMP - optional. The offset by which to bump the pointer. If not given,
4788 the offset is assumed to be vector_size.
4790 Output: Return NEW_DATAREF_PTR as illustrated above.
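/* Illustrative example: for a vector type of 16 bytes (e.g. V4SI) and
   no explicit BUMP, the statement generated is simply

     NEW_DATAREF_PTR = DATAREF_PTR + 16;

   i.e. the pointer advances by TYPE_SIZE_UNIT of the vector type.  */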
4794 tree
4795 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4796 stmt_vec_info stmt_info, tree bump)
4798 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4799 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4800 tree update = TYPE_SIZE_UNIT (vectype);
4801 gassign *incr_stmt;
4802 ssa_op_iter iter;
4803 use_operand_p use_p;
4804 tree new_dataref_ptr;
4806 if (bump)
4807 update = bump;
4809 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4810 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4811 else
4812 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4813 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4814 dataref_ptr, update);
4815 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4817 /* Copy the points-to information if it exists. */
4818 if (DR_PTR_INFO (dr))
4820 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4821 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4824 if (!ptr_incr)
4825 return new_dataref_ptr;
4827 /* Update the vector-pointer's cross-iteration increment. */
4828 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4830 tree use = USE_FROM_PTR (use_p);
4832 if (use == dataref_ptr)
4833 SET_USE (use_p, new_dataref_ptr);
4834 else
4835 gcc_assert (operand_equal_p (use, update, 0));
4838 return new_dataref_ptr;
4842 /* Copy memory reference info such as base/clique from the SRC reference
4843 to the DEST MEM_REF. */
4845 void
4846 vect_copy_ref_info (tree dest, tree src)
4848 if (TREE_CODE (dest) != MEM_REF)
4849 return;
4851 tree src_base = src;
4852 while (handled_component_p (src_base))
4853 src_base = TREE_OPERAND (src_base, 0);
4854 if (TREE_CODE (src_base) != MEM_REF
4855 && TREE_CODE (src_base) != TARGET_MEM_REF)
4856 return;
4858 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
4859 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
4863 /* Function vect_create_destination_var.
4865 Create a new temporary of type VECTYPE. */
4867 tree
4868 vect_create_destination_var (tree scalar_dest, tree vectype)
4870 tree vec_dest;
4871 const char *name;
4872 char *new_name;
4873 tree type;
4874 enum vect_var_kind kind;
4876 kind = vectype
4877 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4878 ? vect_mask_var
4879 : vect_simple_var
4880 : vect_scalar_var;
4881 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4883 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4885 name = get_name (scalar_dest);
4886 if (name)
4887 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4888 else
4889 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4890 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4891 free (new_name);
4893 return vec_dest;
4896 /* Function vect_grouped_store_supported.
4898 Returns TRUE if interleave high and interleave low permutations
4899 are supported, and FALSE otherwise. */
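/* For instance (illustrative), with V4SI vectors and a power-of-two
   COUNT the two permutations whose support is checked below are the
   interleave masks { 0, 4, 1, 5 } and { 2, 6, 3, 7 }.  */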
4901 bool
4902 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4904 machine_mode mode = TYPE_MODE (vectype);
4906 /* vect_permute_store_chain requires the group size to be equal to 3 or
4907 be a power of two. */
4908 if (count != 3 && exact_log2 (count) == -1)
4910 if (dump_enabled_p ())
4911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4912 "the size of the group of accesses"
4913 " is not a power of 2 or not eqaul to 3\n");
4914 return false;
4917 /* Check that the permutation is supported. */
4918 if (VECTOR_MODE_P (mode))
4920 unsigned int i;
4921 if (count == 3)
4923 unsigned int j0 = 0, j1 = 0, j2 = 0;
4924 unsigned int i, j;
4926 unsigned int nelt;
4927 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4929 if (dump_enabled_p ())
4930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4931 "cannot handle groups of 3 stores for"
4932 " variable-length vectors\n");
4933 return false;
4936 vec_perm_builder sel (nelt, nelt, 1);
4937 sel.quick_grow (nelt);
4938 vec_perm_indices indices;
4939 for (j = 0; j < 3; j++)
4941 int nelt0 = ((3 - j) * nelt) % 3;
4942 int nelt1 = ((3 - j) * nelt + 1) % 3;
4943 int nelt2 = ((3 - j) * nelt + 2) % 3;
4944 for (i = 0; i < nelt; i++)
4946 if (3 * i + nelt0 < nelt)
4947 sel[3 * i + nelt0] = j0++;
4948 if (3 * i + nelt1 < nelt)
4949 sel[3 * i + nelt1] = nelt + j1++;
4950 if (3 * i + nelt2 < nelt)
4951 sel[3 * i + nelt2] = 0;
4953 indices.new_vector (sel, 2, nelt);
4954 if (!can_vec_perm_const_p (mode, indices))
4956 if (dump_enabled_p ())
4957 dump_printf (MSG_MISSED_OPTIMIZATION,
4958 "permutation op not supported by target.\n");
4959 return false;
4962 for (i = 0; i < nelt; i++)
4964 if (3 * i + nelt0 < nelt)
4965 sel[3 * i + nelt0] = 3 * i + nelt0;
4966 if (3 * i + nelt1 < nelt)
4967 sel[3 * i + nelt1] = 3 * i + nelt1;
4968 if (3 * i + nelt2 < nelt)
4969 sel[3 * i + nelt2] = nelt + j2++;
4971 indices.new_vector (sel, 2, nelt);
4972 if (!can_vec_perm_const_p (mode, indices))
4974 if (dump_enabled_p ())
4975 dump_printf (MSG_MISSED_OPTIMIZATION,
4976 "permutation op not supported by target.\n");
4977 return false;
4980 return true;
4982 else
4984 /* If length is not equal to 3 then only power of 2 is supported. */
4985 gcc_assert (pow2p_hwi (count));
4986 poly_uint64 nelt = GET_MODE_NUNITS (mode);
4988 /* The encoding has 2 interleaved stepped patterns. */
4989 vec_perm_builder sel (nelt, 2, 3);
4990 sel.quick_grow (6);
4991 for (i = 0; i < 3; i++)
4993 sel[i * 2] = i;
4994 sel[i * 2 + 1] = i + nelt;
4996 vec_perm_indices indices (sel, 2, nelt);
4997 if (can_vec_perm_const_p (mode, indices))
4999 for (i = 0; i < 6; i++)
5000 sel[i] += exact_div (nelt, 2);
5001 indices.new_vector (sel, 2, nelt);
5002 if (can_vec_perm_const_p (mode, indices))
5003 return true;
5008 if (dump_enabled_p ())
5009 dump_printf (MSG_MISSED_OPTIMIZATION,
5010 "permutation op not supported by target.\n");
5011 return false;
5015 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5016 type VECTYPE. MASKED_P says whether the masked form is needed. */
5018 bool
5019 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5020 bool masked_p)
5022 if (masked_p)
5023 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5024 vec_mask_store_lanes_optab,
5025 vectype, count);
5026 else
5027 return vect_lanes_optab_supported_p ("vec_store_lanes",
5028 vec_store_lanes_optab,
5029 vectype, count);
5033 /* Function vect_permute_store_chain.
5035 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5036 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5037 the data correctly for the stores. Return the final references for stores
5038 in RESULT_CHAIN.
5040 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5041 The input is 4 vectors each containing 8 elements. We assign a number to
5042 each element, the input sequence is:
5044 1st vec: 0 1 2 3 4 5 6 7
5045 2nd vec: 8 9 10 11 12 13 14 15
5046 3rd vec: 16 17 18 19 20 21 22 23
5047 4th vec: 24 25 26 27 28 29 30 31
5049 The output sequence should be:
5051 1st vec: 0 8 16 24 1 9 17 25
5052 2nd vec: 2 10 18 26 3 11 19 27
5053 3rd vec: 4 12 20 28 5 13 21 29
5054 4th vec: 6 14 22 30 7 15 23 31
5056 i.e., we interleave the contents of the four vectors in their order.
5058 We use interleave_high/low instructions to create such output. The input of
5059 each interleave_high/low operation is two vectors:
5060 1st vec 2nd vec
5061 0 1 2 3 4 5 6 7
5062 the even elements of the result vector are obtained left-to-right from the
5063 high/low elements of the first vector. The odd elements of the result are
5064 obtained left-to-right from the high/low elements of the second vector.
5065 The output of interleave_high will be: 0 4 1 5
5066 and of interleave_low: 2 6 3 7
5069 The permutation is done in log LENGTH stages. In each stage interleave_high
5070 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5071 where the first argument is taken from the first half of DR_CHAIN and the
5072 second argument from its second half.
5073 In our example,
5075 I1: interleave_high (1st vec, 3rd vec)
5076 I2: interleave_low (1st vec, 3rd vec)
5077 I3: interleave_high (2nd vec, 4th vec)
5078 I4: interleave_low (2nd vec, 4th vec)
5080 The output for the first stage is:
5082 I1: 0 16 1 17 2 18 3 19
5083 I2: 4 20 5 21 6 22 7 23
5084 I3: 8 24 9 25 10 26 11 27
5085 I4: 12 28 13 29 14 30 15 31
5087 The output of the second stage, i.e. the final result is:
5089 I1: 0 8 16 24 1 9 17 25
5090 I2: 2 10 18 26 3 11 19 27
5091 I3: 4 12 20 28 5 13 21 29
5092 I4: 6 14 22 30 7 15 23 31. */
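/* As a concrete (illustrative) instance of the power-of-two path below,
   for V8HI vectors the two masks used at every stage are

     perm_mask_high = { 0, 8, 1, 9, 2, 10, 3, 11 }
     perm_mask_low  = { 4, 12, 5, 13, 6, 14, 7, 15 }

   applied to each (vect1, vect2) pair taken from the two halves of
   DR_CHAIN.  */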
5094 void
5095 vect_permute_store_chain (vec<tree> dr_chain,
5096 unsigned int length,
5097 stmt_vec_info stmt_info,
5098 gimple_stmt_iterator *gsi,
5099 vec<tree> *result_chain)
5101 tree vect1, vect2, high, low;
5102 gimple *perm_stmt;
5103 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5104 tree perm_mask_low, perm_mask_high;
5105 tree data_ref;
5106 tree perm3_mask_low, perm3_mask_high;
5107 unsigned int i, j, n, log_length = exact_log2 (length);
5109 result_chain->quick_grow (length);
5110 memcpy (result_chain->address (), dr_chain.address (),
5111 length * sizeof (tree));
5113 if (length == 3)
5115 /* vect_grouped_store_supported ensures that this is constant. */
5116 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5117 unsigned int j0 = 0, j1 = 0, j2 = 0;
5119 vec_perm_builder sel (nelt, nelt, 1);
5120 sel.quick_grow (nelt);
5121 vec_perm_indices indices;
5122 for (j = 0; j < 3; j++)
5124 int nelt0 = ((3 - j) * nelt) % 3;
5125 int nelt1 = ((3 - j) * nelt + 1) % 3;
5126 int nelt2 = ((3 - j) * nelt + 2) % 3;
5128 for (i = 0; i < nelt; i++)
5130 if (3 * i + nelt0 < nelt)
5131 sel[3 * i + nelt0] = j0++;
5132 if (3 * i + nelt1 < nelt)
5133 sel[3 * i + nelt1] = nelt + j1++;
5134 if (3 * i + nelt2 < nelt)
5135 sel[3 * i + nelt2] = 0;
5137 indices.new_vector (sel, 2, nelt);
5138 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5140 for (i = 0; i < nelt; i++)
5142 if (3 * i + nelt0 < nelt)
5143 sel[3 * i + nelt0] = 3 * i + nelt0;
5144 if (3 * i + nelt1 < nelt)
5145 sel[3 * i + nelt1] = 3 * i + nelt1;
5146 if (3 * i + nelt2 < nelt)
5147 sel[3 * i + nelt2] = nelt + j2++;
5149 indices.new_vector (sel, 2, nelt);
5150 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5152 vect1 = dr_chain[0];
5153 vect2 = dr_chain[1];
5155 /* Create interleaving stmt:
5156 low = VEC_PERM_EXPR <vect1, vect2,
5157 {j, nelt, *, j + 1, nelt + j + 1, *,
5158 j + 2, nelt + j + 2, *, ...}> */
5159 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5160 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5161 vect2, perm3_mask_low);
5162 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5164 vect1 = data_ref;
5165 vect2 = dr_chain[2];
5166 /* Create interleaving stmt:
5167 low = VEC_PERM_EXPR <vect1, vect2,
5168 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5169 6, 7, nelt + j + 2, ...}> */
5170 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5171 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5172 vect2, perm3_mask_high);
5173 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5174 (*result_chain)[j] = data_ref;
5177 else
5179 /* If length is not equal to 3 then only power of 2 is supported. */
5180 gcc_assert (pow2p_hwi (length));
5182 /* The encoding has 2 interleaved stepped patterns. */
5183 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5184 vec_perm_builder sel (nelt, 2, 3);
5185 sel.quick_grow (6);
5186 for (i = 0; i < 3; i++)
5188 sel[i * 2] = i;
5189 sel[i * 2 + 1] = i + nelt;
5191 vec_perm_indices indices (sel, 2, nelt);
5192 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5194 for (i = 0; i < 6; i++)
5195 sel[i] += exact_div (nelt, 2);
5196 indices.new_vector (sel, 2, nelt);
5197 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5199 for (i = 0, n = log_length; i < n; i++)
5201 for (j = 0; j < length/2; j++)
5203 vect1 = dr_chain[j];
5204 vect2 = dr_chain[j+length/2];
5206 /* Create interleaving stmt:
5207 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5208 ...}> */
5209 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5210 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5211 vect2, perm_mask_high);
5212 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5213 (*result_chain)[2*j] = high;
5215 /* Create interleaving stmt:
5216 low = VEC_PERM_EXPR <vect1, vect2,
5217 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5218 ...}> */
5219 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5220 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5221 vect2, perm_mask_low);
5222 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5223 (*result_chain)[2*j+1] = low;
5225 memcpy (dr_chain.address (), result_chain->address (),
5226 length * sizeof (tree));
5231 /* Function vect_setup_realignment
5233 This function is called when vectorizing an unaligned load using
5234 the dr_explicit_realign[_optimized] scheme.
5235 This function generates the following code at the loop prolog:
5237 p = initial_addr;
5238 x msq_init = *(floor(p)); # prolog load
5239 realignment_token = call target_builtin;
5240 loop:
5241 x msq = phi (msq_init, ---)
5243 The stmts marked with x are generated only for the case of
5244 dr_explicit_realign_optimized.
5246 The code above sets up a new (vector) pointer, pointing to the first
5247 location accessed by STMT_INFO, and a "floor-aligned" load using that
5248 pointer. It also generates code to compute the "realignment-token"
5249 (if the relevant target hook was defined), and creates a phi-node at the
5250 loop-header bb whose arguments are the result of the prolog-load (created
5251 by this function) and the result of a load that takes place in the loop
5252 (to be created by the caller to this function).
5254 For the case of dr_explicit_realign_optimized:
5255 The caller to this function uses the phi-result (msq) to create the
5256 realignment code inside the loop, and sets up the missing phi argument,
5257 as follows:
5258 loop:
5259 msq = phi (msq_init, lsq)
5260 lsq = *(floor(p')); # load in loop
5261 result = realign_load (msq, lsq, realignment_token);
5263 For the case of dr_explicit_realign:
5264 loop:
5265 msq = *(floor(p)); # load in loop
5266 p' = p + (VS-1);
5267 lsq = *(floor(p')); # load in loop
5268 result = realign_load (msq, lsq, realignment_token);
5270 Input:
5271 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5272 a memory location that may be unaligned.
5273 BSI - place where new code is to be inserted.
5274 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5275 is used.
5277 Output:
5278 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5279 target hook, if defined.
5280 Return value - the result of the loop-header phi node. */
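/* For illustration only: suppose the vector size is 16 bytes and p is
   misaligned by 4 bytes.  Then floor(p) is p rounded down to the 16-byte
   boundary, msq holds the aligned 16 bytes that contain the first part of
   the desired data, lsq holds the following aligned 16 bytes, and
   REALIGN_LOAD <msq, lsq, realignment_token> combines the two according to
   the token (typically derived from the low bits of the address) to yield
   the 16 bytes that start at p.  */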
5282 tree
5283 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5284 tree *realignment_token,
5285 enum dr_alignment_support alignment_support_scheme,
5286 tree init_addr,
5287 struct loop **at_loop)
5289 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5290 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5291 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5292 struct data_reference *dr = dr_info->dr;
5293 struct loop *loop = NULL;
5294 edge pe = NULL;
5295 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5296 tree vec_dest;
5297 gimple *inc;
5298 tree ptr;
5299 tree data_ref;
5300 basic_block new_bb;
5301 tree msq_init = NULL_TREE;
5302 tree new_temp;
5303 gphi *phi_stmt;
5304 tree msq = NULL_TREE;
5305 gimple_seq stmts = NULL;
5306 bool compute_in_loop = false;
5307 bool nested_in_vect_loop = false;
5308 struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5309 struct loop *loop_for_initial_load = NULL;
5311 if (loop_vinfo)
5313 loop = LOOP_VINFO_LOOP (loop_vinfo);
5314 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5317 gcc_assert (alignment_support_scheme == dr_explicit_realign
5318 || alignment_support_scheme == dr_explicit_realign_optimized);
5320 /* We need to generate three things:
5321 1. the misalignment computation
5322 2. the extra vector load (for the optimized realignment scheme).
5323 3. the phi node for the two vectors from which the realignment is
5324 done (for the optimized realignment scheme). */
5326 /* 1. Determine where to generate the misalignment computation.
5328 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5329 calculation will be generated by this function, outside the loop (in the
5330 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5331 caller, inside the loop.
5333 Background: If the misalignment remains fixed throughout the iterations of
5334 the loop, then both realignment schemes are applicable, and also the
5335 misalignment computation can be done outside LOOP. This is because we are
5336 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5337 are a multiple of VS (the Vector Size), and therefore the misalignment in
5338 different vectorized LOOP iterations is always the same.
5339 The problem arises only if the memory access is in an inner-loop nested
5340 inside LOOP, which is now being vectorized using outer-loop vectorization.
5341 This is the only case when the misalignment of the memory access may not
5342 remain fixed throughout the iterations of the inner-loop (as explained in
5343 detail in vect_supportable_dr_alignment). In this case, not only is the
5344 optimized realignment scheme not applicable, but also the misalignment
5345 computation (and generation of the realignment token that is passed to
5346 REALIGN_LOAD) have to be done inside the loop.
5348 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5349 or not, which in turn determines if the misalignment is computed inside
5350 the inner-loop, or outside LOOP. */
5352 if (init_addr != NULL_TREE || !loop_vinfo)
5354 compute_in_loop = true;
5355 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5359 /* 2. Determine where to generate the extra vector load.
5361 For the optimized realignment scheme, instead of generating two vector
5362 loads in each iteration, we generate a single extra vector load in the
5363 preheader of the loop, and in each iteration reuse the result of the
5364 vector load from the previous iteration. In case the memory access is in
5365 an inner-loop nested inside LOOP, which is now being vectorized using
5366 outer-loop vectorization, we need to determine whether this initial vector
5367 load should be generated at the preheader of the inner-loop, or can be
5368 generated at the preheader of LOOP. If the memory access has no evolution
5369 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5370 to be generated inside LOOP (in the preheader of the inner-loop). */
5372 if (nested_in_vect_loop)
5374 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5375 bool invariant_in_outerloop =
5376 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5377 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5379 else
5380 loop_for_initial_load = loop;
5381 if (at_loop)
5382 *at_loop = loop_for_initial_load;
5384 if (loop_for_initial_load)
5385 pe = loop_preheader_edge (loop_for_initial_load);
5387 /* 3. For the case of the optimized realignment, create the first vector
5388 load at the loop preheader. */
5390 if (alignment_support_scheme == dr_explicit_realign_optimized)
5392 /* Create msq_init = *(floor(p1)) in the loop preheader */
5393 gassign *new_stmt;
5395 gcc_assert (!compute_in_loop);
5396 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5397 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5398 loop_for_initial_load, NULL_TREE,
5399 &init_addr, NULL, &inc, true);
5400 if (TREE_CODE (ptr) == SSA_NAME)
5401 new_temp = copy_ssa_name (ptr);
5402 else
5403 new_temp = make_ssa_name (TREE_TYPE (ptr));
5404 unsigned int align = DR_TARGET_ALIGNMENT (dr_info);
5405 new_stmt = gimple_build_assign
5406 (new_temp, BIT_AND_EXPR, ptr,
5407 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5408 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5409 gcc_assert (!new_bb);
5410 data_ref
5411 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5412 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5413 vect_copy_ref_info (data_ref, DR_REF (dr));
5414 new_stmt = gimple_build_assign (vec_dest, data_ref);
5415 new_temp = make_ssa_name (vec_dest, new_stmt);
5416 gimple_assign_set_lhs (new_stmt, new_temp);
5417 if (pe)
5419 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5420 gcc_assert (!new_bb);
5422 else
5423 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5425 msq_init = gimple_assign_lhs (new_stmt);
5428 /* 4. Create realignment token using a target builtin, if available.
5429 It is done either inside the containing loop, or before LOOP (as
5430 determined above). */
5432 if (targetm.vectorize.builtin_mask_for_load)
5434 gcall *new_stmt;
5435 tree builtin_decl;
5437 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5438 if (!init_addr)
5440 /* Generate the INIT_ADDR computation outside LOOP. */
5441 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5442 NULL_TREE);
5443 if (loop)
5445 pe = loop_preheader_edge (loop);
5446 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5447 gcc_assert (!new_bb);
5449 else
5450 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5453 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5454 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5455 vec_dest =
5456 vect_create_destination_var (scalar_dest,
5457 gimple_call_return_type (new_stmt));
5458 new_temp = make_ssa_name (vec_dest, new_stmt);
5459 gimple_call_set_lhs (new_stmt, new_temp);
5461 if (compute_in_loop)
5462 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5463 else
5465 /* Generate the misalignment computation outside LOOP. */
5466 pe = loop_preheader_edge (loop);
5467 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5468 gcc_assert (!new_bb);
5471 *realignment_token = gimple_call_lhs (new_stmt);
5473 /* The result of the CALL_EXPR to this builtin is determined from
5474 the value of the parameter and no global variables are touched
5475 which makes the builtin a "const" function. Requiring the
5476 builtin to have the "const" attribute makes it unnecessary
5477 to call mark_call_clobbered. */
5478 gcc_assert (TREE_READONLY (builtin_decl));
5481 if (alignment_support_scheme == dr_explicit_realign)
5482 return msq;
5484 gcc_assert (!compute_in_loop);
5485 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5488 /* 5. Create msq = phi <msq_init, lsq> in loop */
5490 pe = loop_preheader_edge (containing_loop);
5491 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5492 msq = make_ssa_name (vec_dest);
5493 phi_stmt = create_phi_node (msq, containing_loop->header);
5494 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5496 return msq;
5500 /* Function vect_grouped_load_supported.
5502 COUNT is the size of the load group (the number of statements plus the
5503 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5504 only one statement, with a gap of COUNT - 1.
5506 Returns true if a suitable permute exists. */
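/* For instance, a group consisting of a single load that uses one element
   out of every four has COUNT == 4, SINGLE_ELEMENT_P true and a gap of 3.  */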
5508 bool
5509 vect_grouped_load_supported (tree vectype, bool single_element_p,
5510 unsigned HOST_WIDE_INT count)
5512 machine_mode mode = TYPE_MODE (vectype);
5514 /* If this is single-element interleaving with an element distance
5515 that leaves unused vector loads around, punt - we at least create
5516 very sub-optimal code in that case (and blow up memory,
5517 see PR65518). */
5518 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5520 if (dump_enabled_p ())
5521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5522 "single-element interleaving not supported "
5523 "for not adjacent vector loads\n");
5524 return false;
5527 /* vect_permute_load_chain requires the group size to be equal to 3 or
5528 be a power of two. */
5529 if (count != 3 && exact_log2 (count) == -1)
5531 if (dump_enabled_p ())
5532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5533 "the size of the group of accesses"
5534 " is not a power of 2 or not equal to 3\n");
5535 return false;
5538 /* Check that the permutation is supported. */
5539 if (VECTOR_MODE_P (mode))
5541 unsigned int i, j;
5542 if (count == 3)
5544 unsigned int nelt;
5545 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5547 if (dump_enabled_p ())
5548 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5549 "cannot handle groups of 3 loads for"
5550 " variable-length vectors\n");
5551 return false;
5554 vec_perm_builder sel (nelt, nelt, 1);
5555 sel.quick_grow (nelt);
5556 vec_perm_indices indices;
5557 unsigned int k;
5558 for (k = 0; k < 3; k++)
5560 for (i = 0; i < nelt; i++)
5561 if (3 * i + k < 2 * nelt)
5562 sel[i] = 3 * i + k;
5563 else
5564 sel[i] = 0;
5565 indices.new_vector (sel, 2, nelt);
5566 if (!can_vec_perm_const_p (mode, indices))
5568 if (dump_enabled_p ())
5569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5570 "shuffle of 3 loads is not supported by"
5571 " target\n");
5572 return false;
5574 for (i = 0, j = 0; i < nelt; i++)
5575 if (3 * i + k < 2 * nelt)
5576 sel[i] = i;
5577 else
5578 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5579 indices.new_vector (sel, 2, nelt);
5580 if (!can_vec_perm_const_p (mode, indices))
5582 if (dump_enabled_p ())
5583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5584 "shuffle of 3 loads is not supported by"
5585 " target\n");
5586 return false;
5589 return true;
5591 else
5593 /* If length is not equal to 3 then only a power of 2 is supported. */
5594 gcc_assert (pow2p_hwi (count));
5595 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5597 /* The encoding has a single stepped pattern. */
5598 vec_perm_builder sel (nelt, 1, 3);
5599 sel.quick_grow (3);
5600 for (i = 0; i < 3; i++)
5601 sel[i] = i * 2;
5602 vec_perm_indices indices (sel, 2, nelt);
5603 if (can_vec_perm_const_p (mode, indices))
5605 for (i = 0; i < 3; i++)
5606 sel[i] = i * 2 + 1;
5607 indices.new_vector (sel, 2, nelt);
5608 if (can_vec_perm_const_p (mode, indices))
5609 return true;
5614 if (dump_enabled_p ())
5615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5616 "extract even/odd not supported by target\n");
5617 return false;
5620 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5621 type VECTYPE. MASKED_P says whether the masked form is needed. */
5623 bool
5624 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5625 bool masked_p)
5627 if (masked_p)
5628 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5629 vec_mask_load_lanes_optab,
5630 vectype, count);
5631 else
5632 return vect_lanes_optab_supported_p ("vec_load_lanes",
5633 vec_load_lanes_optab,
5634 vectype, count);
5637 /* Function vect_permute_load_chain.
5639 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5640 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5641 the input data correctly. Return the final references for loads in
5642 RESULT_CHAIN.
5644 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5645 The input is 4 vectors each containing 8 elements. We assign a number to each
5646 element; the input sequence is:
5648 1st vec: 0 1 2 3 4 5 6 7
5649 2nd vec: 8 9 10 11 12 13 14 15
5650 3rd vec: 16 17 18 19 20 21 22 23
5651 4th vec: 24 25 26 27 28 29 30 31
5653 The output sequence should be:
5655 1st vec: 0 4 8 12 16 20 24 28
5656 2nd vec: 1 5 9 13 17 21 25 29
5657 3rd vec: 2 6 10 14 18 22 26 30
5658 4th vec: 3 7 11 15 19 23 27 31
5660 i.e., the first output vector should contain the first elements of each
5661 interleaving group, etc.
5663 We use extract_even/odd instructions to create such output. The input of
5664 each extract_even/odd operation is two vectors
5665 1st vec: 0 1 2 3
5666 2nd vec: 4 5 6 7
5668 and the output is the vector of extracted even/odd elements. The output of
5669 extract_even will be: 0 2 4 6
5670 and of extract_odd: 1 3 5 7
5673 The permutation is done in log LENGTH stages. In each stage extract_even
5674 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5675 their order. In our example,
5677 E1: extract_even (1st vec, 2nd vec)
5678 E2: extract_odd (1st vec, 2nd vec)
5679 E3: extract_even (3rd vec, 4th vec)
5680 E4: extract_odd (3rd vec, 4th vec)
5682 The output for the first stage will be:
5684 E1: 0 2 4 6 8 10 12 14
5685 E2: 1 3 5 7 9 11 13 15
5686 E3: 16 18 20 22 24 26 28 30
5687 E4: 17 19 21 23 25 27 29 31
5689 In order to proceed and create the correct sequence for the next stage (or
5690 for the correct output, if the second stage is the last one, as in our
5691 example), we first put the output of extract_even operation and then the
5692 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5693 The input for the second stage is:
5695 1st vec (E1): 0 2 4 6 8 10 12 14
5696 2nd vec (E3): 16 18 20 22 24 26 28 30
5697 3rd vec (E2): 1 3 5 7 9 11 13 15
5698 4th vec (E4): 17 19 21 23 25 27 29 31
5700 The output of the second stage:
5702 E1: 0 4 8 12 16 20 24 28
5703 E2: 2 6 10 14 18 22 26 30
5704 E3: 1 5 9 13 17 21 25 29
5705 E4: 3 7 11 15 19 23 27 31
5707 And RESULT_CHAIN after reordering:
5709 1st vec (E1): 0 4 8 12 16 20 24 28
5710 2nd vec (E3): 1 5 9 13 17 21 25 29
5711 3rd vec (E2): 2 6 10 14 18 22 26 30
5712 4th vec (E4): 3 7 11 15 19 23 27 31. */
5714 static void
5715 vect_permute_load_chain (vec<tree> dr_chain,
5716 unsigned int length,
5717 stmt_vec_info stmt_info,
5718 gimple_stmt_iterator *gsi,
5719 vec<tree> *result_chain)
5721 tree data_ref, first_vect, second_vect;
5722 tree perm_mask_even, perm_mask_odd;
5723 tree perm3_mask_low, perm3_mask_high;
5724 gimple *perm_stmt;
5725 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5726 unsigned int i, j, log_length = exact_log2 (length);
5728 result_chain->quick_grow (length);
5729 memcpy (result_chain->address (), dr_chain.address (),
5730 length * sizeof (tree));
5732 if (length == 3)
5734 /* vect_grouped_load_supported ensures that this is constant. */
5735 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5736 unsigned int k;
5738 vec_perm_builder sel (nelt, nelt, 1);
5739 sel.quick_grow (nelt);
5740 vec_perm_indices indices;
5741 for (k = 0; k < 3; k++)
5743 for (i = 0; i < nelt; i++)
5744 if (3 * i + k < 2 * nelt)
5745 sel[i] = 3 * i + k;
5746 else
5747 sel[i] = 0;
5748 indices.new_vector (sel, 2, nelt);
5749 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5751 for (i = 0, j = 0; i < nelt; i++)
5752 if (3 * i + k < 2 * nelt)
5753 sel[i] = i;
5754 else
5755 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5756 indices.new_vector (sel, 2, nelt);
5757 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
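/* For example, with nelt == 8 and k == 0 this gives
   perm3_mask_low == {0, 3, 6, 9, 12, 15, 0, 0} and
   perm3_mask_high == {0, 1, 2, 3, 4, 5, 10, 13}; the two permutes below
   then produce {0, 3, 6, 9, 12, 15, 18, 21}, i.e. every third element
   of the group.  */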
5759 first_vect = dr_chain[0];
5760 second_vect = dr_chain[1];
5762 /* Create interleaving stmt (low part of):
5763 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5764 ...}> */
5765 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5766 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5767 second_vect, perm3_mask_low);
5768 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5770 /* Create interleaving stmt (high part of):
5771 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5772 ...}> */
5773 first_vect = data_ref;
5774 second_vect = dr_chain[2];
5775 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5776 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5777 second_vect, perm3_mask_high);
5778 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5779 (*result_chain)[k] = data_ref;
5782 else
5784 /* If length is not equal to 3 then only a power of 2 is supported. */
5785 gcc_assert (pow2p_hwi (length));
5787 /* The encoding has a single stepped pattern. */
5788 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5789 vec_perm_builder sel (nelt, 1, 3);
5790 sel.quick_grow (3);
5791 for (i = 0; i < 3; ++i)
5792 sel[i] = i * 2;
5793 vec_perm_indices indices (sel, 2, nelt);
5794 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5796 for (i = 0; i < 3; ++i)
5797 sel[i] = i * 2 + 1;
5798 indices.new_vector (sel, 2, nelt);
5799 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
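/* For example, with nelt == 8, perm_mask_even is {0, 2, 4, 6, 8, 10, 12, 14}
   and perm_mask_odd is {1, 3, 5, 7, 9, 11, 13, 15}, where indices 8..15
   select from the second input of the VEC_PERM_EXPR.  */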
5801 for (i = 0; i < log_length; i++)
5803 for (j = 0; j < length; j += 2)
5805 first_vect = dr_chain[j];
5806 second_vect = dr_chain[j+1];
5808 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5809 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5810 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5811 first_vect, second_vect,
5812 perm_mask_even);
5813 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5814 (*result_chain)[j/2] = data_ref;
5816 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5817 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5818 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5819 first_vect, second_vect,
5820 perm_mask_odd);
5821 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5822 (*result_chain)[j/2+length/2] = data_ref;
5824 memcpy (dr_chain.address (), result_chain->address (),
5825 length * sizeof (tree));
5830 /* Function vect_shift_permute_load_chain.
5832 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5833 sequence of stmts to reorder the input data accordingly.
5834 Return the final references for loads in RESULT_CHAIN.
5835 Return true if successful, false otherwise.
5837 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5838 The input is 3 vectors each containing 8 elements. We assign a
5839 number to each element; the input sequence is:
5841 1st vec: 0 1 2 3 4 5 6 7
5842 2nd vec: 8 9 10 11 12 13 14 15
5843 3rd vec: 16 17 18 19 20 21 22 23
5845 The output sequence should be:
5847 1st vec: 0 3 6 9 12 15 18 21
5848 2nd vec: 1 4 7 10 13 16 19 22
5849 3rd vec: 2 5 8 11 14 17 20 23
5851 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5853 First we shuffle all 3 vectors to get correct elements order:
5855 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5856 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5857 3rd vec: (16 19 22) (17 20 23) (18 21)
5859 Next we unite and shift vector 3 times:
5861 1st step:
5862 shift right by 6 the concatenation of:
5863 "1st vec" and "2nd vec"
5864 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5865 "2nd vec" and "3rd vec"
5866 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5867 "3rd vec" and "1st vec"
5868 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5869 | New vectors |
5871 So that now new vectors are:
5873 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5874 2nd vec: (10 13) (16 19 22) (17 20 23)
5875 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5877 2nd step:
5878 shift right by 5 the concatenation of:
5879 "1st vec" and "3rd vec"
5880 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5881 "2nd vec" and "1st vec"
5882 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5883 "3rd vec" and "2nd vec"
5884 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5885 | New vectors |
5887 So that now new vectors are:
5889 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5890 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5891 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5893 3rd step:
5894 shift right by 5 the concatenation of:
5895 "1st vec" and "1st vec"
5896 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5897 shift right by 3 the concatenation of:
5898 "2nd vec" and "2nd vec"
5899 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5900 | New vectors |
5902 So that now all vectors are READY:
5903 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5904 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5905 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5907 This algorithm is faster than the one in vect_permute_load_chain if:
5908 1. "shift of a concatenation" is faster than general permutation.
5909 This is usually so.
5910 2. The TARGET machine can't execute vector instructions in parallel.
5911 This is because each step of the algorithm depends on the previous one.
5912 The algorithm in vect_permute_load_chain is much more parallel.
5914 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5917 static bool
5918 vect_shift_permute_load_chain (vec<tree> dr_chain,
5919 unsigned int length,
5920 stmt_vec_info stmt_info,
5921 gimple_stmt_iterator *gsi,
5922 vec<tree> *result_chain)
5924 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5925 tree perm2_mask1, perm2_mask2, perm3_mask;
5926 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5927 gimple *perm_stmt;
5929 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5930 unsigned int i;
5931 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5933 unsigned HOST_WIDE_INT nelt, vf;
5934 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
5935 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
5936 /* Not supported for variable-length vectors. */
5937 return false;
5939 vec_perm_builder sel (nelt, nelt, 1);
5940 sel.quick_grow (nelt);
5942 result_chain->quick_grow (length);
5943 memcpy (result_chain->address (), dr_chain.address (),
5944 length * sizeof (tree));
5946 if (pow2p_hwi (length) && vf > 4)
5948 unsigned int j, log_length = exact_log2 (length);
5949 for (i = 0; i < nelt / 2; ++i)
5950 sel[i] = i * 2;
5951 for (i = 0; i < nelt / 2; ++i)
5952 sel[nelt / 2 + i] = i * 2 + 1;
5953 vec_perm_indices indices (sel, 2, nelt);
5954 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5956 if (dump_enabled_p ())
5957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5958 "shuffle of 2 fields structure is not \
5959 supported by target\n");
5960 return false;
5962 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
5964 for (i = 0; i < nelt / 2; ++i)
5965 sel[i] = i * 2 + 1;
5966 for (i = 0; i < nelt / 2; ++i)
5967 sel[nelt / 2 + i] = i * 2;
5968 indices.new_vector (sel, 2, nelt);
5969 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5971 if (dump_enabled_p ())
5972 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5973 "shuffle of 2 fields structure is not \
5974 supported by target\n");
5975 return false;
5977 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
5979 /* Generating permutation constant to shift all elements.
5980 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5981 for (i = 0; i < nelt; i++)
5982 sel[i] = nelt / 2 + i;
5983 indices.new_vector (sel, 2, nelt);
5984 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5986 if (dump_enabled_p ())
5987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5988 "shift permutation is not supported by target\n");
5989 return false;
5991 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5993 /* Generating permutation constant to select elements from the 2 vectors.
5994 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5995 for (i = 0; i < nelt / 2; i++)
5996 sel[i] = i;
5997 for (i = nelt / 2; i < nelt; i++)
5998 sel[i] = nelt + i;
5999 indices.new_vector (sel, 2, nelt);
6000 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6002 if (dump_enabled_p ())
6003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6004 "select is not supported by target\n");
6005 return false;
6007 select_mask = vect_gen_perm_mask_checked (vectype, indices);
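/* In the loop below, for each input pair (v1, v2):
   vect[0] = <even elements of v1, odd elements of v1>	(perm2_mask1)
   vect[1] = <odd elements of v2, even elements of v2>	(perm2_mask2)
   shift1_mask then extracts <odd(v1), odd(v2)> from the concatenation of
   vect[0] and vect[1], and select_mask extracts <even(v1), even(v2)>.  */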
6009 for (i = 0; i < log_length; i++)
6011 for (j = 0; j < length; j += 2)
6013 first_vect = dr_chain[j];
6014 second_vect = dr_chain[j + 1];
6016 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6017 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6018 first_vect, first_vect,
6019 perm2_mask1);
6020 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6021 vect[0] = data_ref;
6023 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6024 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6025 second_vect, second_vect,
6026 perm2_mask2);
6027 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6028 vect[1] = data_ref;
6030 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6031 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6032 vect[0], vect[1], shift1_mask);
6033 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6034 (*result_chain)[j/2 + length/2] = data_ref;
6036 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6037 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6038 vect[0], vect[1], select_mask);
6039 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6040 (*result_chain)[j/2] = data_ref;
6042 memcpy (dr_chain.address (), result_chain->address (),
6043 length * sizeof (tree));
6045 return true;
6047 if (length == 3 && vf > 2)
6049 unsigned int k = 0, l = 0;
6051 /* Generating permutation constant to get all elements in right order.
6052 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6053 for (i = 0; i < nelt; i++)
6055 if (3 * k + (l % 3) >= nelt)
6057 k = 0;
6058 l += (3 - (nelt % 3));
6060 sel[i] = 3 * k + (l % 3);
6061 k++;
6063 vec_perm_indices indices (sel, 2, nelt);
6064 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6066 if (dump_enabled_p ())
6067 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6068 "shuffle of 3 fields structure is not \
6069 supported by target\n");
6070 return false;
6072 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6074 /* Generating permutation constant to shift all elements.
6075 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6076 for (i = 0; i < nelt; i++)
6077 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6078 indices.new_vector (sel, 2, nelt);
6079 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6081 if (dump_enabled_p ())
6082 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6083 "shift permutation is not supported by target\n");
6084 return false;
6086 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6088 /* Generating permutation constant to shift all elements.
6089 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6090 for (i = 0; i < nelt; i++)
6091 sel[i] = 2 * (nelt / 3) + 1 + i;
6092 indices.new_vector (sel, 2, nelt);
6093 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6095 if (dump_enabled_p ())
6096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6097 "shift permutation is not supported by target\n");
6098 return false;
6100 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6102 /* Generating permutation constant to shift all elements.
6103 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6104 for (i = 0; i < nelt; i++)
6105 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6106 indices.new_vector (sel, 2, nelt);
6107 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6109 if (dump_enabled_p ())
6110 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6111 "shift permutation is not supported by target\n");
6112 return false;
6114 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6116 /* Generating permutation constant to shift all elements.
6117 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6118 for (i = 0; i < nelt; i++)
6119 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6120 indices.new_vector (sel, 2, nelt);
6121 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6123 if (dump_enabled_p ())
6124 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6125 "shift permutation is not supported by target\n");
6126 return false;
6128 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6130 for (k = 0; k < 3; k++)
6132 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6133 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6134 dr_chain[k], dr_chain[k],
6135 perm3_mask);
6136 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6137 vect[k] = data_ref;
6140 for (k = 0; k < 3; k++)
6142 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6143 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6144 vect[k % 3], vect[(k + 1) % 3],
6145 shift1_mask);
6146 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6147 vect_shift[k] = data_ref;
6150 for (k = 0; k < 3; k++)
6152 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6153 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6154 vect_shift[(4 - k) % 3],
6155 vect_shift[(3 - k) % 3],
6156 shift2_mask);
6157 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6158 vect[k] = data_ref;
6161 (*result_chain)[3 - (nelt % 3)] = vect[2];
6163 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6164 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6165 vect[0], shift3_mask);
6166 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6167 (*result_chain)[nelt % 3] = data_ref;
6169 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6170 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6171 vect[1], shift4_mask);
6172 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6173 (*result_chain)[0] = data_ref;
6174 return true;
6176 return false;
6179 /* Function vect_transform_grouped_load.
6181 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6182 to perform their permutation and assign the resulting vectorized statements to
6183 the scalar statements.
6186 void
6187 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6188 int size, gimple_stmt_iterator *gsi)
6190 machine_mode mode;
6191 vec<tree> result_chain = vNULL;
6193 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6194 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6195 vectors, that are ready for vector computation. */
6196 result_chain.create (size);
6198 /* If the reassociation width for the vector type is 2 or greater, the target
6199 machine can execute 2 or more vector instructions in parallel. Otherwise
6200 try to get a chain for the load group using vect_shift_permute_load_chain. */
6201 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6202 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6203 || pow2p_hwi (size)
6204 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6205 gsi, &result_chain))
6206 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6207 vect_record_grouped_load_vectors (stmt_info, result_chain);
6208 result_chain.release ();
6211 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6212 generated as part of the vectorization of STMT_INFO. Assign the statement
6213 for each vector to the associated scalar statement. */
6215 void
6216 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6217 vec<tree> result_chain)
6219 vec_info *vinfo = stmt_info->vinfo;
6220 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6221 unsigned int i, gap_count;
6222 tree tmp_data_ref;
6224 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6225 Since we scan the chain starting from its first node, their order
6226 corresponds to the order of data-refs in RESULT_CHAIN.
6227 stmt_vec_info next_stmt_info = first_stmt_info;
6228 gap_count = 1;
6229 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6231 if (!next_stmt_info)
6232 break;
6234 /* Skip the gaps. Loads created for the gaps will be removed by the dead
6235 code elimination pass later. No need to check for the first stmt in
6236 the group, since it always exists.
6237 DR_GROUP_GAP is the number of steps in elements from the previous
6238 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6239 correspond to the gaps. */
6240 if (next_stmt_info != first_stmt_info
6241 && gap_count < DR_GROUP_GAP (next_stmt_info))
6243 gap_count++;
6244 continue;
6247 while (next_stmt_info)
6249 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6250 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6251 copies, and we put the new vector statement in the first available
6252 RELATED_STMT. */
6253 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6254 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6255 else
6257 if (!DR_GROUP_SAME_DR_STMT (next_stmt_info))
6259 stmt_vec_info prev_stmt_info
6260 = STMT_VINFO_VEC_STMT (next_stmt_info);
6261 stmt_vec_info rel_stmt_info
6262 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6263 while (rel_stmt_info)
6265 prev_stmt_info = rel_stmt_info;
6266 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6269 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6273 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6274 gap_count = 1;
6275 /* If NEXT_STMT_INFO accesses the same DR as the previous statement,
6276 put the same TMP_DATA_REF as its vectorized statement; otherwise
6277 get the next data-ref from RESULT_CHAIN. */
6278 if (!next_stmt_info || !DR_GROUP_SAME_DR_STMT (next_stmt_info))
6279 break;
6284 /* Function vect_can_force_dr_alignment_p.
6286 Returns whether the alignment of a DECL can be forced to be aligned
6287 on an ALIGNMENT-bit boundary. */
6289 bool
6290 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6292 if (!VAR_P (decl))
6293 return false;
6295 if (decl_in_symtab_p (decl)
6296 && !symtab_node::get (decl)->can_increase_alignment_p ())
6297 return false;
6299 if (TREE_STATIC (decl))
6300 return (alignment <= MAX_OFILE_ALIGNMENT);
6301 else
6302 return (alignment <= MAX_STACK_ALIGNMENT);
6306 /* Return whether the data reference DR_INFO is supported with respect to its
6307 alignment.
6308 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6309 if it is aligned, i.e., check if it is possible to vectorize it with different
6310 alignment. */
6312 enum dr_alignment_support
6313 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6314 bool check_aligned_accesses)
6316 data_reference *dr = dr_info->dr;
6317 stmt_vec_info stmt_info = dr_info->stmt;
6318 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6319 machine_mode mode = TYPE_MODE (vectype);
6320 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6321 struct loop *vect_loop = NULL;
6322 bool nested_in_vect_loop = false;
6324 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6325 return dr_aligned;
6327 /* For now assume all conditional loads/stores support unaligned
6328 access without any special code. */
6329 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6330 if (gimple_call_internal_p (stmt)
6331 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6332 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6333 return dr_unaligned_supported;
6335 if (loop_vinfo)
6337 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6338 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6341 /* Possibly unaligned access. */
6343 /* We can choose between using the implicit realignment scheme (generating
6344 a misaligned_move stmt) and the explicit realignment scheme (generating
6345 aligned loads with a REALIGN_LOAD). There are two variants to the
6346 explicit realignment scheme: optimized, and unoptimized.
6347 We can optimize the realignment only if the step between consecutive
6348 vector loads is equal to the vector size. Since the vector memory
6349 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6350 is guaranteed that the misalignment amount remains the same throughout the
6351 execution of the vectorized loop. Therefore, we can create the
6352 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6353 at the loop preheader.
6355 However, in the case of outer-loop vectorization, when vectorizing a
6356 memory access in the inner-loop nested within the LOOP that is now being
6357 vectorized, while it is guaranteed that the misalignment of the
6358 vectorized memory access will remain the same in different outer-loop
6359 iterations, it is *not* guaranteed that it will remain the same throughout
6360 the execution of the inner-loop. This is because the inner-loop advances
6361 with the original scalar step (and not in steps of VS). If the inner-loop
6362 step happens to be a multiple of VS, then the misalignment remains fixed
6363 and we can use the optimized realignment scheme. For example:
6365 for (i=0; i<N; i++)
6366 for (j=0; j<M; j++)
6367 s += a[i+j];
6369 When vectorizing the i-loop in the above example, the step between
6370 consecutive vector loads is 1, and so the misalignment does not remain
6371 fixed across the execution of the inner-loop, and the realignment cannot
6372 be optimized (as illustrated in the following pseudo vectorized loop):
6374 for (i=0; i<N; i+=4)
6375 for (j=0; j<M; j++){
6376 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6377 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6378 // (assuming that we start from an aligned address).
6381 We therefore have to use the unoptimized realignment scheme:
6383 for (i=0; i<N; i+=4)
6384 for (j=k; j<M; j+=4)
6385 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6386 // that the misalignment of the initial address is
6387 // 0).
6389 The loop can then be vectorized as follows:
6391 for (k=0; k<4; k++){
6392 rt = get_realignment_token (&vp[k]);
6393 for (i=0; i<N; i+=4){
6394 v1 = vp[i+k];
6395 for (j=k; j<M; j+=4){
6396 v2 = vp[i+j+VS-1];
6397 va = REALIGN_LOAD <v1,v2,rt>;
6398 vs += va;
6399 v1 = v2;
6402 } */
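/* Roughly, the logic below is: for reads, use the explicit realignment
   scheme when the target provides vec_realign_load (and, if defined, a
   working mask-for-load builtin), choosing the optimized variant when the
   misalignment is invariant across vector iterations; otherwise, for both
   reads and writes, return dr_unaligned_supported if the target supports
   misaligned vector accesses of this mode, and dr_unaligned_unsupported
   if not.  */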
6404 if (DR_IS_READ (dr))
6406 bool is_packed = false;
6407 tree type = (TREE_TYPE (DR_REF (dr)));
6409 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6410 && (!targetm.vectorize.builtin_mask_for_load
6411 || targetm.vectorize.builtin_mask_for_load ()))
6413 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6415 /* If we are doing SLP then the accesses need not have the
6416 same alignment; instead it depends on the SLP group size. */
6417 if (loop_vinfo
6418 && STMT_SLP_TYPE (stmt_info)
6419 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6420 * (DR_GROUP_SIZE
6421 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6422 TYPE_VECTOR_SUBPARTS (vectype)))
6424 else if (!loop_vinfo
6425 || (nested_in_vect_loop
6426 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6427 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6428 return dr_explicit_realign;
6429 else
6430 return dr_explicit_realign_optimized;
6432 if (!known_alignment_for_access_p (dr_info))
6433 is_packed = not_size_aligned (DR_REF (dr));
6435 if (targetm.vectorize.support_vector_misalignment
6436 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6437 /* Can't software pipeline the loads, but can at least do them. */
6438 return dr_unaligned_supported;
6440 else
6442 bool is_packed = false;
6443 tree type = (TREE_TYPE (DR_REF (dr)));
6445 if (!known_alignment_for_access_p (dr_info))
6446 is_packed = not_size_aligned (DR_REF (dr));
6448 if (targetm.vectorize.support_vector_misalignment
6449 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6450 return dr_unaligned_supported;
6453 /* Unsupported. */
6454 return dr_unaligned_unsupported;