* lto.c (do_stream_out): Add PART parameter; open dump file.
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob63429a34bf2e16e99f9c5253e3ee66284b2d9fe0
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 if (is_gimple_assign (stmt)
134 && (gimple_assign_cast_p (stmt)
135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 *lhs_size_unit = lhs;
149 *rhs_size_unit = rhs;
150 return scalar_type;
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
158 static bool
159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164 return false;
166 if (!runtime_alias_check_p (ddr, loop,
167 optimize_loop_nest_for_speed_p (loop)))
168 return false;
170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171 return true;
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
176 static void
177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
179 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180 for (unsigned int i = 0; i < checks.length(); ++i)
181 if (checks[i] == value)
182 return;
184 if (dump_enabled_p ())
186 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188 dump_printf (MSG_NOTE, " is nonzero\n");
190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
193 /* Return true if we know that the order of vectorized STMT_A and
194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195 At least one of the statements is a write. */
197 static bool
198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
200 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206 return true;
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 group are emitted at the position of the first scalar load and all
210 stores in a group are emitted at the position of the last scalar store.
211 Thus writes will happen no earlier than their current position
212 (but could happen later) while reads will happen no later than their
213 current position (but could happen earlier). Reordering is therefore
214 only possible if the first access is a write. */
215 if (is_pattern_stmt_p (stmtinfo_a))
216 stmt_a = STMT_VINFO_RELATED_STMT (stmtinfo_a);
217 if (is_pattern_stmt_p (stmtinfo_b))
218 stmt_b = STMT_VINFO_RELATED_STMT (stmtinfo_b);
219 gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
220 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
223 /* A subroutine of vect_analyze_data_ref_dependence. Handle
224 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
225 distances. These distances are conservatively correct but they don't
226 reflect a guaranteed dependence.
228 Return true if this function does all the work necessary to avoid
229 an alias or false if the caller should use the dependence distances
230 to limit the vectorization factor in the usual way. LOOP_DEPTH is
231 the depth of the loop described by LOOP_VINFO and the other arguments
232 are as for vect_analyze_data_ref_dependence. */
234 static bool
235 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
236 loop_vec_info loop_vinfo,
237 int loop_depth, unsigned int *max_vf)
239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
240 lambda_vector dist_v;
241 unsigned int i;
242 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
244 int dist = dist_v[loop_depth];
245 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
247 /* If the user asserted safelen >= DIST consecutive iterations
248 can be executed concurrently, assume independence.
250 ??? An alternative would be to add the alias check even
251 in this case, and vectorize the fallback loop with the
252 maximum VF set to safelen. However, if the user has
253 explicitly given a length, it's less likely that that
254 would be a win. */
255 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
257 if ((unsigned int) loop->safelen < *max_vf)
258 *max_vf = loop->safelen;
259 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
260 continue;
263 /* For dependence distances of 2 or more, we have the option
264 of limiting VF or checking for an alias at runtime.
265 Prefer to check at runtime if we can, to avoid limiting
266 the VF unnecessarily when the bases are in fact independent.
268 Note that the alias checks will be removed if the VF ends up
269 being small enough. */
270 return (!STMT_VINFO_GATHER_SCATTER_P
271 (vinfo_for_stmt (DR_STMT (DDR_A (ddr))))
272 && !STMT_VINFO_GATHER_SCATTER_P
273 (vinfo_for_stmt (DR_STMT (DDR_B (ddr))))
274 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
277 return true;
281 /* Function vect_analyze_data_ref_dependence.
283 Return TRUE if there (might) exist a dependence between a memory-reference
284 DRA and a memory-reference DRB. When versioning for alias may check a
285 dependence at run-time, return FALSE. Adjust *MAX_VF according to
286 the data dependence. */
288 static bool
289 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
290 loop_vec_info loop_vinfo,
291 unsigned int *max_vf)
293 unsigned int i;
294 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
295 struct data_reference *dra = DDR_A (ddr);
296 struct data_reference *drb = DDR_B (ddr);
297 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
298 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
299 lambda_vector dist_v;
300 unsigned int loop_depth;
302 /* In loop analysis all data references should be vectorizable. */
303 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
304 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
305 gcc_unreachable ();
307 /* Independent data accesses. */
308 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
309 return false;
311 if (dra == drb
312 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
313 return false;
315 /* We do not have to consider dependences between accesses that belong
316 to the same group, unless the stride could be smaller than the
317 group size. */
318 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
319 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
320 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
321 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
322 return false;
324 /* Even if we have an anti-dependence then, as the vectorized loop covers at
325 least two scalar iterations, there is always also a true dependence.
326 As the vectorizer does not re-order loads and stores we can ignore
327 the anti-dependence if TBAA can disambiguate both DRs similar to the
328 case with known negative distance anti-dependences (positive
329 distance anti-dependences would violate TBAA constraints). */
330 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
331 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
332 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
333 get_alias_set (DR_REF (drb))))
334 return false;
336 /* Unknown data dependence. */
337 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
339 /* If user asserted safelen consecutive iterations can be
340 executed concurrently, assume independence. */
341 if (loop->safelen >= 2)
343 if ((unsigned int) loop->safelen < *max_vf)
344 *max_vf = loop->safelen;
345 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
346 return false;
349 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
350 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
355 "versioning for alias not supported for: "
356 "can't determine dependence between ");
357 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
358 DR_REF (dra));
359 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
360 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
361 DR_REF (drb));
362 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
364 return true;
367 if (dump_enabled_p ())
369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
370 "versioning for alias required: "
371 "can't determine dependence between ");
372 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
373 DR_REF (dra));
374 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
375 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
376 DR_REF (drb));
377 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
380 /* Add to list of ddrs that need to be tested at run-time. */
381 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
384 /* Known data dependence. */
385 if (DDR_NUM_DIST_VECTS (ddr) == 0)
387 /* If user asserted safelen consecutive iterations can be
388 executed concurrently, assume independence. */
389 if (loop->safelen >= 2)
391 if ((unsigned int) loop->safelen < *max_vf)
392 *max_vf = loop->safelen;
393 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
394 return false;
397 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
398 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
400 if (dump_enabled_p ())
402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
403 "versioning for alias not supported for: "
404 "bad dist vector for ");
405 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
406 DR_REF (dra));
407 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
408 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
409 DR_REF (drb));
410 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
412 return true;
415 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "versioning for alias required: "
419 "bad dist vector for ");
420 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
421 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
422 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
423 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
425 /* Add to list of ddrs that need to be tested at run-time. */
426 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
429 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
431 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
432 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
433 loop_depth, max_vf))
434 return false;
436 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
438 int dist = dist_v[loop_depth];
440 if (dump_enabled_p ())
441 dump_printf_loc (MSG_NOTE, vect_location,
442 "dependence distance = %d.\n", dist);
444 if (dist == 0)
446 if (dump_enabled_p ())
448 dump_printf_loc (MSG_NOTE, vect_location,
449 "dependence distance == 0 between ");
450 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
451 dump_printf (MSG_NOTE, " and ");
452 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
453 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
456 /* When we perform grouped accesses and perform implicit CSE
457 by detecting equal accesses and doing disambiguation with
458 runtime alias tests like for
459 .. = a[i];
460 .. = a[i+1];
461 a[i] = ..;
462 a[i+1] = ..;
463 *p = ..;
464 .. = a[i];
465 .. = a[i+1];
466 where we will end up loading { a[i], a[i+1] } once, make
467 sure that inserting group loads before the first load and
468 stores after the last store will do the right thing.
469 Similar for groups like
470 a[i] = ...;
471 ... = a[i];
472 a[i+1] = ...;
473 where loads from the group interleave with the store. */
474 if (!vect_preserves_scalar_order_p (vect_dr_stmt(dra),
475 vect_dr_stmt (drb)))
477 if (dump_enabled_p ())
478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
479 "READ_WRITE dependence in interleaving.\n");
480 return true;
483 if (loop->safelen < 2)
485 tree indicator = dr_zero_step_indicator (dra);
486 if (!indicator || integer_zerop (indicator))
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
490 "access also has a zero step\n");
491 return true;
493 else if (TREE_CODE (indicator) != INTEGER_CST)
494 vect_check_nonzero_value (loop_vinfo, indicator);
496 continue;
499 if (dist > 0 && DDR_REVERSED_P (ddr))
501 /* If DDR_REVERSED_P the order of the data-refs in DDR was
502 reversed (to make distance vector positive), and the actual
503 distance is negative. */
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
506 "dependence distance negative.\n");
507 /* Record a negative dependence distance to later limit the
508 amount of stmt copying / unrolling we can perform.
509 Only need to handle read-after-write dependence. */
510 if (DR_IS_READ (drb)
511 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
512 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
513 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
514 continue;
517 unsigned int abs_dist = abs (dist);
518 if (abs_dist >= 2 && abs_dist < *max_vf)
520 /* The dependence distance requires reduction of the maximal
521 vectorization factor. */
522 *max_vf = abs (dist);
523 if (dump_enabled_p ())
524 dump_printf_loc (MSG_NOTE, vect_location,
525 "adjusting maximal vectorization factor to %i\n",
526 *max_vf);
529 if (abs_dist >= *max_vf)
531 /* Dependence distance does not create dependence, as far as
532 vectorization is concerned, in this case. */
533 if (dump_enabled_p ())
534 dump_printf_loc (MSG_NOTE, vect_location,
535 "dependence distance >= VF.\n");
536 continue;
539 if (dump_enabled_p ())
541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
542 "not vectorized, possible dependence "
543 "between data-refs ");
544 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
545 dump_printf (MSG_NOTE, " and ");
546 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
547 dump_printf (MSG_NOTE, "\n");
550 return true;
553 return false;
556 /* Function vect_analyze_data_ref_dependences.
558 Examine all the data references in the loop, and make sure there do not
559 exist any data dependences between them. Set *MAX_VF according to
560 the maximum vectorization factor the data dependences allow. */
562 bool
563 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
564 unsigned int *max_vf)
566 unsigned int i;
567 struct data_dependence_relation *ddr;
569 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
571 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
573 LOOP_VINFO_DDRS (loop_vinfo)
574 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
575 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
576 /* We need read-read dependences to compute
577 STMT_VINFO_SAME_ALIGN_REFS. */
578 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
579 &LOOP_VINFO_DDRS (loop_vinfo),
580 LOOP_VINFO_LOOP_NEST (loop_vinfo),
581 true);
582 gcc_assert (res);
585 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
587 /* For epilogues we either have no aliases or alias versioning
588 was applied to original loop. Therefore we may just get max_vf
589 using VF of original loop. */
590 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
591 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
592 else
593 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
594 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
595 return false;
597 return true;
601 /* Function vect_slp_analyze_data_ref_dependence.
603 Return TRUE if there (might) exist a dependence between a memory-reference
604 DRA and a memory-reference DRB. When versioning for alias may check a
605 dependence at run-time, return FALSE. Adjust *MAX_VF according to
606 the data dependence. */
608 static bool
609 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
611 struct data_reference *dra = DDR_A (ddr);
612 struct data_reference *drb = DDR_B (ddr);
614 /* We need to check dependences of statements marked as unvectorizable
615 as well, they still can prohibit vectorization. */
617 /* Independent data accesses. */
618 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
619 return false;
621 if (dra == drb)
622 return false;
624 /* Read-read is OK. */
625 if (DR_IS_READ (dra) && DR_IS_READ (drb))
626 return false;
628 /* If dra and drb are part of the same interleaving chain consider
629 them independent. */
630 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (vect_dr_stmt (dra)))
631 && (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dra)))
632 == DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (drb)))))
633 return false;
635 /* Unknown data dependence. */
636 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
638 if (dump_enabled_p ())
640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
641 "can't determine dependence between ");
642 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
643 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
644 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
645 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
648 else if (dump_enabled_p ())
650 dump_printf_loc (MSG_NOTE, vect_location,
651 "determined dependence between ");
652 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
653 dump_printf (MSG_NOTE, " and ");
654 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
655 dump_printf (MSG_NOTE, "\n");
658 return true;
662 /* Analyze dependences involved in the transform of SLP NODE. STORES
663 contain the vector of scalar stores of this instance if we are
664 disambiguating the loads. */
666 static bool
667 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
668 vec<gimple *> stores, gimple *last_store)
670 /* This walks over all stmts involved in the SLP load/store done
671 in NODE verifying we can sink them up to the last stmt in the
672 group. */
673 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
674 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
676 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
677 if (access == last_access)
678 continue;
679 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
680 ao_ref ref;
681 bool ref_initialized_p = false;
682 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
683 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
685 gimple *stmt = gsi_stmt (gsi);
686 if (! gimple_vuse (stmt)
687 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
688 continue;
690 /* If we couldn't record a (single) data reference for this
691 stmt we have to resort to the alias oracle. */
692 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
693 if (!dr_b)
695 /* We are moving a store or sinking a load - this means
696 we cannot use TBAA for disambiguation. */
697 if (!ref_initialized_p)
698 ao_ref_init (&ref, DR_REF (dr_a));
699 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
700 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
701 return false;
702 continue;
705 bool dependent = false;
706 /* If we run into a store of this same instance (we've just
707 marked those) then delay dependence checking until we run
708 into the last store because this is where it will have
709 been sunk to (and we verify if we can do that as well). */
710 if (gimple_visited_p (stmt))
712 if (stmt != last_store)
713 continue;
714 unsigned i;
715 gimple *store;
716 FOR_EACH_VEC_ELT (stores, i, store)
718 data_reference *store_dr
719 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
720 ddr_p ddr = initialize_data_dependence_relation
721 (dr_a, store_dr, vNULL);
722 dependent = vect_slp_analyze_data_ref_dependence (ddr);
723 free_dependence_relation (ddr);
724 if (dependent)
725 break;
728 else
730 ddr_p ddr = initialize_data_dependence_relation (dr_a,
731 dr_b, vNULL);
732 dependent = vect_slp_analyze_data_ref_dependence (ddr);
733 free_dependence_relation (ddr);
735 if (dependent)
736 return false;
739 return true;
743 /* Function vect_analyze_data_ref_dependences.
745 Examine all the data references in the basic-block, and make sure there
746 do not exist any data dependences between them. Set *MAX_VF according to
747 the maximum vectorization factor the data dependences allow. */
749 bool
750 vect_slp_analyze_instance_dependence (slp_instance instance)
752 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
754 /* The stores of this instance are at the root of the SLP tree. */
755 slp_tree store = SLP_INSTANCE_TREE (instance);
756 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
757 store = NULL;
759 /* Verify we can sink stores to the vectorized stmt insert location. */
760 gimple *last_store = NULL;
761 if (store)
763 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
764 return false;
766 /* Mark stores in this instance and remember the last one. */
767 last_store = vect_find_last_scalar_stmt_in_slp (store);
768 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
769 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
772 bool res = true;
774 /* Verify we can sink loads to the vectorized stmt insert location,
775 special-casing stores of this instance. */
776 slp_tree load;
777 unsigned int i;
778 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
779 if (! vect_slp_analyze_node_dependences (instance, load,
780 store
781 ? SLP_TREE_SCALAR_STMTS (store)
782 : vNULL, last_store))
784 res = false;
785 break;
788 /* Unset the visited flag. */
789 if (store)
790 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
791 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
793 return res;
796 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
797 the statement that contains DRB, which is useful for recording in the
798 dump file. */
800 static void
801 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
802 innermost_loop_behavior *drb)
804 bool existed;
805 innermost_loop_behavior *&entry
806 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
807 if (!existed || entry->base_alignment < drb->base_alignment)
809 entry = drb;
810 if (dump_enabled_p ())
812 dump_printf_loc (MSG_NOTE, vect_location,
813 "recording new base alignment for ");
814 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
815 dump_printf (MSG_NOTE, "\n");
816 dump_printf_loc (MSG_NOTE, vect_location,
817 " alignment: %d\n", drb->base_alignment);
818 dump_printf_loc (MSG_NOTE, vect_location,
819 " misalignment: %d\n", drb->base_misalignment);
820 dump_printf_loc (MSG_NOTE, vect_location,
821 " based on: ");
822 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
827 /* If the region we're going to vectorize is reached, all unconditional
828 data references occur at least once. We can therefore pool the base
829 alignment guarantees from each unconditional reference. Do this by
830 going through all the data references in VINFO and checking whether
831 the containing statement makes the reference unconditionally. If so,
832 record the alignment of the base address in VINFO so that it can be
833 used for all other references with the same base. */
835 void
836 vect_record_base_alignments (vec_info *vinfo)
838 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
839 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
840 data_reference *dr;
841 unsigned int i;
842 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
844 gimple *stmt = vect_dr_stmt (dr);
845 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
846 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
847 && STMT_VINFO_VECTORIZABLE (stmt_info)
848 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
850 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
852 /* If DR is nested in the loop that is being vectorized, we can also
853 record the alignment of the base wrt the outer loop. */
854 if (loop && nested_in_vect_loop_p (loop, stmt))
855 vect_record_base_alignment
856 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
861 /* Return the target alignment for the vectorized form of DR. */
863 static unsigned int
864 vect_calculate_target_alignment (struct data_reference *dr)
866 gimple *stmt = vect_dr_stmt (dr);
867 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
868 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
869 return targetm.vectorize.preferred_vector_alignment (vectype);
872 /* Function vect_compute_data_ref_alignment
874 Compute the misalignment of the data reference DR.
876 Output:
877 1. DR_MISALIGNMENT (DR) is defined.
879 FOR NOW: No analysis is actually performed. Misalignment is calculated
880 only for trivial cases. TODO. */
882 static void
883 vect_compute_data_ref_alignment (struct data_reference *dr)
885 gimple *stmt = vect_dr_stmt (dr);
886 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
887 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
888 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
889 struct loop *loop = NULL;
890 tree ref = DR_REF (dr);
891 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
893 if (dump_enabled_p ())
894 dump_printf_loc (MSG_NOTE, vect_location,
895 "vect_compute_data_ref_alignment:\n");
897 if (loop_vinfo)
898 loop = LOOP_VINFO_LOOP (loop_vinfo);
900 /* Initialize misalignment to unknown. */
901 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
903 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
904 return;
906 innermost_loop_behavior *drb = vect_dr_behavior (dr);
907 bool step_preserves_misalignment_p;
909 unsigned HOST_WIDE_INT vector_alignment
910 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
911 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
913 /* No step for BB vectorization. */
914 if (!loop)
916 gcc_assert (integer_zerop (drb->step));
917 step_preserves_misalignment_p = true;
920 /* In case the dataref is in an inner-loop of the loop that is being
921 vectorized (LOOP), we use the base and misalignment information
922 relative to the outer-loop (LOOP). This is ok only if the misalignment
923 stays the same throughout the execution of the inner-loop, which is why
924 we have to check that the stride of the dataref in the inner-loop evenly
925 divides by the vector alignment. */
926 else if (nested_in_vect_loop_p (loop, stmt))
928 step_preserves_misalignment_p
929 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
931 if (dump_enabled_p ())
933 if (step_preserves_misalignment_p)
934 dump_printf_loc (MSG_NOTE, vect_location,
935 "inner step divides the vector alignment.\n");
936 else
937 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
938 "inner step doesn't divide the vector"
939 " alignment.\n");
943 /* Similarly we can only use base and misalignment information relative to
944 an innermost loop if the misalignment stays the same throughout the
945 execution of the loop. As above, this is the case if the stride of
946 the dataref evenly divides by the alignment. */
947 else
949 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
950 step_preserves_misalignment_p
951 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
953 if (!step_preserves_misalignment_p && dump_enabled_p ())
954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
955 "step doesn't divide the vector alignment.\n");
958 unsigned int base_alignment = drb->base_alignment;
959 unsigned int base_misalignment = drb->base_misalignment;
961 /* Calculate the maximum of the pooled base address alignment and the
962 alignment that we can compute for DR itself. */
963 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
964 if (entry && base_alignment < (*entry)->base_alignment)
966 base_alignment = (*entry)->base_alignment;
967 base_misalignment = (*entry)->base_misalignment;
970 if (drb->offset_alignment < vector_alignment
971 || !step_preserves_misalignment_p
972 /* We need to know whether the step wrt the vectorized loop is
973 negative when computing the starting misalignment below. */
974 || TREE_CODE (drb->step) != INTEGER_CST)
976 if (dump_enabled_p ())
978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
979 "Unknown alignment for access: ");
980 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
981 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
983 return;
986 if (base_alignment < vector_alignment)
988 unsigned int max_alignment;
989 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
990 if (max_alignment < vector_alignment
991 || !vect_can_force_dr_alignment_p (base,
992 vector_alignment * BITS_PER_UNIT))
994 if (dump_enabled_p ())
996 dump_printf_loc (MSG_NOTE, vect_location,
997 "can't force alignment of ref: ");
998 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
999 dump_printf (MSG_NOTE, "\n");
1001 return;
1004 /* Force the alignment of the decl.
1005 NOTE: This is the only change to the code we make during
1006 the analysis phase, before deciding to vectorize the loop. */
1007 if (dump_enabled_p ())
1009 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1010 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1011 dump_printf (MSG_NOTE, "\n");
1014 DR_VECT_AUX (dr)->base_decl = base;
1015 DR_VECT_AUX (dr)->base_misaligned = true;
1016 base_misalignment = 0;
1018 poly_int64 misalignment
1019 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1021 /* If this is a backward running DR then first access in the larger
1022 vectype actually is N-1 elements before the address in the DR.
1023 Adjust misalign accordingly. */
1024 if (tree_int_cst_sgn (drb->step) < 0)
1025 /* PLUS because STEP is negative. */
1026 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1027 * TREE_INT_CST_LOW (drb->step));
1029 unsigned int const_misalignment;
1030 if (!known_misalignment (misalignment, vector_alignment,
1031 &const_misalignment))
1033 if (dump_enabled_p ())
1035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1036 "Non-constant misalignment for access: ");
1037 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1038 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1040 return;
1043 SET_DR_MISALIGNMENT (dr, const_misalignment);
1045 if (dump_enabled_p ())
1047 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1048 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1049 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1050 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1053 return;
1056 /* Function vect_update_misalignment_for_peel.
1057 Sets DR's misalignment
1058 - to 0 if it has the same alignment as DR_PEEL,
1059 - to the misalignment computed using NPEEL if DR's salignment is known,
1060 - to -1 (unknown) otherwise.
1062 DR - the data reference whose misalignment is to be adjusted.
1063 DR_PEEL - the data reference whose misalignment is being made
1064 zero in the vector loop by the peel.
1065 NPEEL - the number of iterations in the peel loop if the misalignment
1066 of DR_PEEL is known at compile time. */
1068 static void
1069 vect_update_misalignment_for_peel (struct data_reference *dr,
1070 struct data_reference *dr_peel, int npeel)
1072 unsigned int i;
1073 vec<dr_p> same_aligned_drs;
1074 struct data_reference *current_dr;
1075 int dr_size = vect_get_scalar_dr_size (dr);
1076 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1077 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
1078 stmt_vec_info peel_stmt_info = vinfo_for_stmt (vect_dr_stmt (dr_peel));
1080 /* For interleaved data accesses the step in the loop must be multiplied by
1081 the size of the interleaving group. */
1082 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1083 dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
1084 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1085 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1087 /* It can be assumed that the data refs with the same alignment as dr_peel
1088 are aligned in the vector loop. */
1089 same_aligned_drs
1090 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (vect_dr_stmt (dr_peel)));
1091 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1093 if (current_dr != dr)
1094 continue;
1095 gcc_assert (!known_alignment_for_access_p (dr)
1096 || !known_alignment_for_access_p (dr_peel)
1097 || (DR_MISALIGNMENT (dr) / dr_size
1098 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1099 SET_DR_MISALIGNMENT (dr, 0);
1100 return;
1103 if (known_alignment_for_access_p (dr)
1104 && known_alignment_for_access_p (dr_peel))
1106 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1107 int misal = DR_MISALIGNMENT (dr);
1108 misal += negative ? -npeel * dr_size : npeel * dr_size;
1109 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1110 SET_DR_MISALIGNMENT (dr, misal);
1111 return;
1114 if (dump_enabled_p ())
1115 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1116 "to unknown (-1).\n");
1117 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1121 /* Function verify_data_ref_alignment
1123 Return TRUE if DR can be handled with respect to alignment. */
1125 static bool
1126 verify_data_ref_alignment (data_reference_p dr)
1128 enum dr_alignment_support supportable_dr_alignment
1129 = vect_supportable_dr_alignment (dr, false);
1130 if (!supportable_dr_alignment)
1132 if (dump_enabled_p ())
1134 if (DR_IS_READ (dr))
1135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1136 "not vectorized: unsupported unaligned load.");
1137 else
1138 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1139 "not vectorized: unsupported unaligned "
1140 "store.");
1142 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1143 DR_REF (dr));
1144 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1146 return false;
1149 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1150 dump_printf_loc (MSG_NOTE, vect_location,
1151 "Vectorizing an unaligned access.\n");
1153 return true;
1156 /* Function vect_verify_datarefs_alignment
1158 Return TRUE if all data references in the loop can be
1159 handled with respect to alignment. */
1161 bool
1162 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1164 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1165 struct data_reference *dr;
1166 unsigned int i;
1168 FOR_EACH_VEC_ELT (datarefs, i, dr)
1170 gimple *stmt = vect_dr_stmt (dr);
1171 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1173 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1174 continue;
1176 /* For interleaving, only the alignment of the first access matters. */
1177 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1178 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1179 continue;
1181 /* Strided accesses perform only component accesses, alignment is
1182 irrelevant for them. */
1183 if (STMT_VINFO_STRIDED_P (stmt_info)
1184 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1185 continue;
1187 if (! verify_data_ref_alignment (dr))
1188 return false;
1191 return true;
1194 /* Given an memory reference EXP return whether its alignment is less
1195 than its size. */
1197 static bool
1198 not_size_aligned (tree exp)
1200 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1201 return true;
1203 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1204 > get_object_alignment (exp));
1207 /* Function vector_alignment_reachable_p
1209 Return true if vector alignment for DR is reachable by peeling
1210 a few loop iterations. Return false otherwise. */
1212 static bool
1213 vector_alignment_reachable_p (struct data_reference *dr)
1215 gimple *stmt = vect_dr_stmt (dr);
1216 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1217 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1219 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1221 /* For interleaved access we peel only if number of iterations in
1222 the prolog loop ({VF - misalignment}), is a multiple of the
1223 number of the interleaved accesses. */
1224 int elem_size, mis_in_elements;
1226 /* FORNOW: handle only known alignment. */
1227 if (!known_alignment_for_access_p (dr))
1228 return false;
1230 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1231 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1232 elem_size = vector_element_size (vector_size, nelements);
1233 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1235 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1236 return false;
1239 /* If misalignment is known at the compile time then allow peeling
1240 only if natural alignment is reachable through peeling. */
1241 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1243 HOST_WIDE_INT elmsize =
1244 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1245 if (dump_enabled_p ())
1247 dump_printf_loc (MSG_NOTE, vect_location,
1248 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1249 dump_printf (MSG_NOTE,
1250 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1252 if (DR_MISALIGNMENT (dr) % elmsize)
1254 if (dump_enabled_p ())
1255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1256 "data size does not divide the misalignment.\n");
1257 return false;
1261 if (!known_alignment_for_access_p (dr))
1263 tree type = TREE_TYPE (DR_REF (dr));
1264 bool is_packed = not_size_aligned (DR_REF (dr));
1265 if (dump_enabled_p ())
1266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1267 "Unknown misalignment, %snaturally aligned\n",
1268 is_packed ? "not " : "");
1269 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1272 return true;
1276 /* Calculate the cost of the memory access represented by DR. */
1278 static void
1279 vect_get_data_access_cost (struct data_reference *dr,
1280 unsigned int *inside_cost,
1281 unsigned int *outside_cost,
1282 stmt_vector_for_cost *body_cost_vec,
1283 stmt_vector_for_cost *prologue_cost_vec)
1285 gimple *stmt = vect_dr_stmt (dr);
1286 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1287 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1288 int ncopies;
1290 if (PURE_SLP_STMT (stmt_info))
1291 ncopies = 1;
1292 else
1293 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1295 if (DR_IS_READ (dr))
1296 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1297 prologue_cost_vec, body_cost_vec, false);
1298 else
1299 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1301 if (dump_enabled_p ())
1302 dump_printf_loc (MSG_NOTE, vect_location,
1303 "vect_get_data_access_cost: inside_cost = %d, "
1304 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1308 typedef struct _vect_peel_info
1310 struct data_reference *dr;
1311 int npeel;
1312 unsigned int count;
1313 } *vect_peel_info;
1315 typedef struct _vect_peel_extended_info
1317 struct _vect_peel_info peel_info;
1318 unsigned int inside_cost;
1319 unsigned int outside_cost;
1320 } *vect_peel_extended_info;
1323 /* Peeling hashtable helpers. */
1325 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1327 static inline hashval_t hash (const _vect_peel_info *);
1328 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1331 inline hashval_t
1332 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1334 return (hashval_t) peel_info->npeel;
1337 inline bool
1338 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1340 return (a->npeel == b->npeel);
1344 /* Insert DR into peeling hash table with NPEEL as key. */
1346 static void
1347 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1348 loop_vec_info loop_vinfo, struct data_reference *dr,
1349 int npeel)
1351 struct _vect_peel_info elem, *slot;
1352 _vect_peel_info **new_slot;
1353 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1355 elem.npeel = npeel;
1356 slot = peeling_htab->find (&elem);
1357 if (slot)
1358 slot->count++;
1359 else
1361 slot = XNEW (struct _vect_peel_info);
1362 slot->npeel = npeel;
1363 slot->dr = dr;
1364 slot->count = 1;
1365 new_slot = peeling_htab->find_slot (slot, INSERT);
1366 *new_slot = slot;
1369 if (!supportable_dr_alignment
1370 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1371 slot->count += VECT_MAX_COST;
1375 /* Traverse peeling hash table to find peeling option that aligns maximum
1376 number of data accesses. */
1379 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1380 _vect_peel_extended_info *max)
1382 vect_peel_info elem = *slot;
1384 if (elem->count > max->peel_info.count
1385 || (elem->count == max->peel_info.count
1386 && max->peel_info.npeel > elem->npeel))
1388 max->peel_info.npeel = elem->npeel;
1389 max->peel_info.count = elem->count;
1390 max->peel_info.dr = elem->dr;
1393 return 1;
1396 /* Get the costs of peeling NPEEL iterations checking data access costs
1397 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1398 misalignment will be zero after peeling. */
1400 static void
1401 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1402 struct data_reference *dr0,
1403 unsigned int *inside_cost,
1404 unsigned int *outside_cost,
1405 stmt_vector_for_cost *body_cost_vec,
1406 stmt_vector_for_cost *prologue_cost_vec,
1407 unsigned int npeel,
1408 bool unknown_misalignment)
1410 unsigned i;
1411 data_reference *dr;
1413 FOR_EACH_VEC_ELT (datarefs, i, dr)
1415 gimple *stmt = vect_dr_stmt (dr);
1416 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1417 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1418 continue;
1420 /* For interleaving, only the alignment of the first access
1421 matters. */
1422 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1423 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1424 continue;
1426 /* Strided accesses perform only component accesses, alignment is
1427 irrelevant for them. */
1428 if (STMT_VINFO_STRIDED_P (stmt_info)
1429 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1430 continue;
1432 int save_misalignment;
1433 save_misalignment = DR_MISALIGNMENT (dr);
1434 if (npeel == 0)
1436 else if (unknown_misalignment && dr == dr0)
1437 SET_DR_MISALIGNMENT (dr, 0);
1438 else
1439 vect_update_misalignment_for_peel (dr, dr0, npeel);
1440 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1441 body_cost_vec, prologue_cost_vec);
1442 SET_DR_MISALIGNMENT (dr, save_misalignment);
1446 /* Traverse peeling hash table and calculate cost for each peeling option.
1447 Find the one with the lowest cost. */
1450 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1451 _vect_peel_extended_info *min)
1453 vect_peel_info elem = *slot;
1454 int dummy;
1455 unsigned int inside_cost = 0, outside_cost = 0;
1456 gimple *stmt = vect_dr_stmt (elem->dr);
1457 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1458 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1459 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1460 epilogue_cost_vec;
1462 prologue_cost_vec.create (2);
1463 body_cost_vec.create (2);
1464 epilogue_cost_vec.create (2);
1466 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1467 elem->dr, &inside_cost, &outside_cost,
1468 &body_cost_vec, &prologue_cost_vec,
1469 elem->npeel, false);
1471 body_cost_vec.release ();
1473 outside_cost += vect_get_known_peeling_cost
1474 (loop_vinfo, elem->npeel, &dummy,
1475 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1476 &prologue_cost_vec, &epilogue_cost_vec);
1478 /* Prologue and epilogue costs are added to the target model later.
1479 These costs depend only on the scalar iteration cost, the
1480 number of peeling iterations finally chosen, and the number of
1481 misaligned statements. So discard the information found here. */
1482 prologue_cost_vec.release ();
1483 epilogue_cost_vec.release ();
1485 if (inside_cost < min->inside_cost
1486 || (inside_cost == min->inside_cost
1487 && outside_cost < min->outside_cost))
1489 min->inside_cost = inside_cost;
1490 min->outside_cost = outside_cost;
1491 min->peel_info.dr = elem->dr;
1492 min->peel_info.npeel = elem->npeel;
1493 min->peel_info.count = elem->count;
1496 return 1;
1500 /* Choose best peeling option by traversing peeling hash table and either
1501 choosing an option with the lowest cost (if cost model is enabled) or the
1502 option that aligns as many accesses as possible. */
1504 static struct _vect_peel_extended_info
1505 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1506 loop_vec_info loop_vinfo)
1508 struct _vect_peel_extended_info res;
1510 res.peel_info.dr = NULL;
1512 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1514 res.inside_cost = INT_MAX;
1515 res.outside_cost = INT_MAX;
1516 peeling_htab->traverse <_vect_peel_extended_info *,
1517 vect_peeling_hash_get_lowest_cost> (&res);
1519 else
1521 res.peel_info.count = 0;
1522 peeling_htab->traverse <_vect_peel_extended_info *,
1523 vect_peeling_hash_get_most_frequent> (&res);
1524 res.inside_cost = 0;
1525 res.outside_cost = 0;
1528 return res;
1531 /* Return true if the new peeling NPEEL is supported. */
1533 static bool
1534 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1535 unsigned npeel)
1537 unsigned i;
1538 struct data_reference *dr = NULL;
1539 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1540 gimple *stmt;
1541 stmt_vec_info stmt_info;
1542 enum dr_alignment_support supportable_dr_alignment;
1544 /* Ensure that all data refs can be vectorized after the peel. */
1545 FOR_EACH_VEC_ELT (datarefs, i, dr)
1547 int save_misalignment;
1549 if (dr == dr0)
1550 continue;
1552 stmt = vect_dr_stmt (dr);
1553 stmt_info = vinfo_for_stmt (stmt);
1554 /* For interleaving, only the alignment of the first access
1555 matters. */
1556 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1557 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1558 continue;
1560 /* Strided accesses perform only component accesses, alignment is
1561 irrelevant for them. */
1562 if (STMT_VINFO_STRIDED_P (stmt_info)
1563 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1564 continue;
1566 save_misalignment = DR_MISALIGNMENT (dr);
1567 vect_update_misalignment_for_peel (dr, dr0, npeel);
1568 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1569 SET_DR_MISALIGNMENT (dr, save_misalignment);
1571 if (!supportable_dr_alignment)
1572 return false;
1575 return true;
1578 /* Function vect_enhance_data_refs_alignment
1580 This pass will use loop versioning and loop peeling in order to enhance
1581 the alignment of data references in the loop.
1583 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1584 original loop is to be vectorized. Any other loops that are created by
1585 the transformations performed in this pass - are not supposed to be
1586 vectorized. This restriction will be relaxed.
1588 This pass will require a cost model to guide it whether to apply peeling
1589 or versioning or a combination of the two. For example, the scheme that
1590 intel uses when given a loop with several memory accesses, is as follows:
1591 choose one memory access ('p') which alignment you want to force by doing
1592 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1593 other accesses are not necessarily aligned, or (2) use loop versioning to
1594 generate one loop in which all accesses are aligned, and another loop in
1595 which only 'p' is necessarily aligned.
1597 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1598 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1599 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1601 Devising a cost model is the most critical aspect of this work. It will
1602 guide us on which access to peel for, whether to use loop versioning, how
1603 many versions to create, etc. The cost model will probably consist of
1604 generic considerations as well as target specific considerations (on
1605 powerpc for example, misaligned stores are more painful than misaligned
1606 loads).
1608 Here are the general steps involved in alignment enhancements:
1610 -- original loop, before alignment analysis:
1611 for (i=0; i<N; i++){
1612 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1613 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1616 -- After vect_compute_data_refs_alignment:
1617 for (i=0; i<N; i++){
1618 x = q[i]; # DR_MISALIGNMENT(q) = 3
1619 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1622 -- Possibility 1: we do loop versioning:
1623 if (p is aligned) {
1624 for (i=0; i<N; i++){ # loop 1A
1625 x = q[i]; # DR_MISALIGNMENT(q) = 3
1626 p[i] = y; # DR_MISALIGNMENT(p) = 0
1629 else {
1630 for (i=0; i<N; i++){ # loop 1B
1631 x = q[i]; # DR_MISALIGNMENT(q) = 3
1632 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1636 -- Possibility 2: we do loop peeling:
1637 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1638 x = q[i];
1639 p[i] = y;
1641 for (i = 3; i < N; i++){ # loop 2A
1642 x = q[i]; # DR_MISALIGNMENT(q) = 0
1643 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1646 -- Possibility 3: combination of loop peeling and versioning:
1647 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1648 x = q[i];
1649 p[i] = y;
1651 if (p is aligned) {
1652 for (i = 3; i<N; i++){ # loop 3A
1653 x = q[i]; # DR_MISALIGNMENT(q) = 0
1654 p[i] = y; # DR_MISALIGNMENT(p) = 0
1657 else {
1658 for (i = 3; i<N; i++){ # loop 3B
1659 x = q[i]; # DR_MISALIGNMENT(q) = 0
1660 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1664 These loops are later passed to loop_transform to be vectorized. The
1665 vectorizer will use the alignment information to guide the transformation
1666 (whether to generate regular loads/stores, or with special handling for
1667 misalignment). */
1669 bool
1670 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1672 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1673 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1674 enum dr_alignment_support supportable_dr_alignment;
1675 struct data_reference *dr0 = NULL, *first_store = NULL;
1676 struct data_reference *dr;
1677 unsigned int i, j;
1678 bool do_peeling = false;
1679 bool do_versioning = false;
1680 bool stat;
1681 gimple *stmt;
1682 stmt_vec_info stmt_info;
1683 unsigned int npeel = 0;
1684 bool one_misalignment_known = false;
1685 bool one_misalignment_unknown = false;
1686 bool one_dr_unsupportable = false;
1687 struct data_reference *unsupportable_dr = NULL;
1688 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1689 unsigned possible_npeel_number = 1;
1690 tree vectype;
1691 unsigned int mis, same_align_drs_max = 0;
1692 hash_table<peel_info_hasher> peeling_htab (1);
1694 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1696 /* Reset data so we can safely be called multiple times. */
1697 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1698 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1700 /* While cost model enhancements are expected in the future, the high level
1701 view of the code at this time is as follows:
1703 A) If there is a misaligned access then see if peeling to align
1704 this access can make all data references satisfy
1705 vect_supportable_dr_alignment. If so, update data structures
1706 as needed and return true.
1708 B) If peeling wasn't possible and there is a data reference with an
1709 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1710 then see if loop versioning checks can be used to make all data
1711 references satisfy vect_supportable_dr_alignment. If so, update
1712 data structures as needed and return true.
1714 C) If neither peeling nor versioning were successful then return false if
1715 any data reference does not satisfy vect_supportable_dr_alignment.
1717 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1719 Note, Possibility 3 above (which is peeling and versioning together) is not
1720 being done at this time. */
1722 /* (1) Peeling to force alignment. */
1724 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1725 Considerations:
1726 + How many accesses will become aligned due to the peeling
1727 - How many accesses will become unaligned due to the peeling,
1728 and the cost of misaligned accesses.
1729 - The cost of peeling (the extra runtime checks, the increase
1730 in code size). */
1732 FOR_EACH_VEC_ELT (datarefs, i, dr)
1734 stmt = vect_dr_stmt (dr);
1735 stmt_info = vinfo_for_stmt (stmt);
1737 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1738 continue;
1740 /* For interleaving, only the alignment of the first access
1741 matters. */
1742 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1743 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1744 continue;
1746 /* For scatter-gather or invariant accesses there is nothing
1747 to enhance. */
1748 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1749 || integer_zerop (DR_STEP (dr)))
1750 continue;
1752 /* Strided accesses perform only component accesses, alignment is
1753 irrelevant for them. */
1754 if (STMT_VINFO_STRIDED_P (stmt_info)
1755 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1756 continue;
1758 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1759 do_peeling = vector_alignment_reachable_p (dr);
1760 if (do_peeling)
1762 if (known_alignment_for_access_p (dr))
1764 unsigned int npeel_tmp = 0;
1765 bool negative = tree_int_cst_compare (DR_STEP (dr),
1766 size_zero_node) < 0;
1768 vectype = STMT_VINFO_VECTYPE (stmt_info);
1769 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1770 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1771 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1772 if (DR_MISALIGNMENT (dr) != 0)
1773 npeel_tmp = (mis & (target_align - 1)) / dr_size;
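/* For illustration, assuming target_align == 16, dr_size == 4 and
   DR_MISALIGNMENT (dr) == 8 for a positive-step access: mis is -8 and
   ((-8) & 15) / 4 == 2, so peeling two scalar iterations moves the
   access onto a 16-byte boundary.  */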
1775 /* For multiple types, it is possible that the bigger type access
1776 will have more than one peeling option. E.g., a loop with two
1777 types: one of size (vector size / 4), and the other one of
1778 size (vector size / 8). The vectorization factor will be 8. If both
1779 accesses are misaligned by 3, the first one needs one scalar
1780 iteration to be aligned, and the second one needs 5. But the
1781 first one will also be aligned by peeling 5 scalar
1782 iterations, and in that case both accesses will be aligned.
1783 Hence, in addition to the immediate peeling amount, we also want
1784 to try adding a full vector size, as long as we don't exceed
1785 the vectorization factor.
1786 We do this automatically for the cost model, since we calculate
1787 the cost for every peeling option. */
1788 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1790 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1791 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1792 possible_npeel_number
1793 = vect_get_num_vectors (nscalars, vectype);
1795 /* NPEEL_TMP is 0 when there is no misalignment, but also
1796 allow peeling a full vector's worth of elements (NELEMENTS). */
1797 if (DR_MISALIGNMENT (dr) == 0)
1798 possible_npeel_number++;
1801 /* Save info about DR in the hash table. Also include peeling
1802 amounts according to the explanation above. */
1803 for (j = 0; j < possible_npeel_number; j++)
1805 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1806 dr, npeel_tmp);
1807 npeel_tmp += target_align / dr_size;
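/* Continuing the illustration above (target_align == 16, dr_size == 4),
   the candidate peeling amounts recorded here would be 2, 6, 10, ...,
   i.e. one full vector's worth of elements apart, with at most
   POSSIBLE_NPEEL_NUMBER entries.  */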
1810 one_misalignment_known = true;
1812 else
1814 /* If we don't know any misalignment values, we prefer
1815 peeling for the data-ref that has the maximum number of data-refs
1816 with the same alignment, unless the target prefers to align
1817 stores over loads. */
1818 unsigned same_align_drs
1819 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1820 if (!dr0
1821 || same_align_drs_max < same_align_drs)
1823 same_align_drs_max = same_align_drs;
1824 dr0 = dr;
1826 /* For data-refs with the same number of related
1827 accesses prefer the one where the misalign
1828 computation will be invariant in the outermost loop. */
1829 else if (same_align_drs_max == same_align_drs)
1831 struct loop *ivloop0, *ivloop;
1832 ivloop0 = outermost_invariant_loop_for_expr
1833 (loop, DR_BASE_ADDRESS (dr0));
1834 ivloop = outermost_invariant_loop_for_expr
1835 (loop, DR_BASE_ADDRESS (dr));
1836 if ((ivloop && !ivloop0)
1837 || (ivloop && ivloop0
1838 && flow_loop_nested_p (ivloop, ivloop0)))
1839 dr0 = dr;
1842 one_misalignment_unknown = true;
1844 /* Check for data refs with unsupportable alignment that
1845 can be peeled. */
1846 if (!supportable_dr_alignment)
1848 one_dr_unsupportable = true;
1849 unsupportable_dr = dr;
1852 if (!first_store && DR_IS_WRITE (dr))
1853 first_store = dr;
1856 else
1858 if (!aligned_access_p (dr))
1860 if (dump_enabled_p ())
1861 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1862 "vector alignment may not be reachable\n");
1863 break;
1868 /* Check if we can possibly peel the loop. */
1869 if (!vect_can_advance_ivs_p (loop_vinfo)
1870 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1871 || loop->inner)
1872 do_peeling = false;
1874 struct _vect_peel_extended_info peel_for_known_alignment;
1875 struct _vect_peel_extended_info peel_for_unknown_alignment;
1876 struct _vect_peel_extended_info best_peel;
1878 peel_for_unknown_alignment.inside_cost = INT_MAX;
1879 peel_for_unknown_alignment.outside_cost = INT_MAX;
1880 peel_for_unknown_alignment.peel_info.count = 0;
1882 if (do_peeling
1883 && one_misalignment_unknown)
1885 /* Check whether the target prefers stores over loads, i.e., whether
1886 misaligned stores are more expensive than misaligned loads (taking
1887 drs with the same alignment into account). */
1888 unsigned int load_inside_cost = 0;
1889 unsigned int load_outside_cost = 0;
1890 unsigned int store_inside_cost = 0;
1891 unsigned int store_outside_cost = 0;
1892 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
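/* With an unknown misalignment the number of iterations that will actually
   be peeled is unknown as well; half of the (assumed) vectorization factor
   is used here as a rough average estimate for costing purposes.  */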
1894 stmt_vector_for_cost dummy;
1895 dummy.create (2);
1896 vect_get_peeling_costs_all_drs (datarefs, dr0,
1897 &load_inside_cost,
1898 &load_outside_cost,
1899 &dummy, &dummy, estimated_npeels, true);
1900 dummy.release ();
1902 if (first_store)
1904 dummy.create (2);
1905 vect_get_peeling_costs_all_drs (datarefs, first_store,
1906 &store_inside_cost,
1907 &store_outside_cost,
1908 &dummy, &dummy,
1909 estimated_npeels, true);
1910 dummy.release ();
1912 else
1914 store_inside_cost = INT_MAX;
1915 store_outside_cost = INT_MAX;
1918 if (load_inside_cost > store_inside_cost
1919 || (load_inside_cost == store_inside_cost
1920 && load_outside_cost > store_outside_cost))
1922 dr0 = first_store;
1923 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1924 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1926 else
1928 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1929 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1932 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1933 prologue_cost_vec.create (2);
1934 epilogue_cost_vec.create (2);
1936 int dummy2;
1937 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1938 (loop_vinfo, estimated_npeels, &dummy2,
1939 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1940 &prologue_cost_vec, &epilogue_cost_vec);
1942 prologue_cost_vec.release ();
1943 epilogue_cost_vec.release ();
1945 peel_for_unknown_alignment.peel_info.count = 1
1946 + STMT_VINFO_SAME_ALIGN_REFS
1947 (vinfo_for_stmt (vect_dr_stmt (dr0))).length ();
1950 peel_for_unknown_alignment.peel_info.npeel = 0;
1951 peel_for_unknown_alignment.peel_info.dr = dr0;
1953 best_peel = peel_for_unknown_alignment;
1955 peel_for_known_alignment.inside_cost = INT_MAX;
1956 peel_for_known_alignment.outside_cost = INT_MAX;
1957 peel_for_known_alignment.peel_info.count = 0;
1958 peel_for_known_alignment.peel_info.dr = NULL;
1960 if (do_peeling && one_misalignment_known)
1962 /* Peeling is possible, and no data access is unsupportable when
1963 misaligned. So we try to choose the best possible peeling from
1964 the hash table. */
1965 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1966 (&peeling_htab, loop_vinfo);
1969 /* Compare costs of peeling for known and unknown alignment. */
1970 if (peel_for_known_alignment.peel_info.dr != NULL
1971 && peel_for_unknown_alignment.inside_cost
1972 >= peel_for_known_alignment.inside_cost)
1974 best_peel = peel_for_known_alignment;
1976 /* If the best peeling for known alignment has NPEEL == 0, perform no
1977 peeling at all except if there is an unsupportable dr that we can
1978 align. */
1979 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1980 do_peeling = false;
1983 /* If there is an unsupportable data ref, prefer this over all choices so far
1984 since we'd have to discard a chosen peeling except when it accidentally
1985 aligned the unsupportable data ref. */
1986 if (one_dr_unsupportable)
1987 dr0 = unsupportable_dr;
1988 else if (do_peeling)
1990 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1991 TODO: Use nopeel_outside_cost or get rid of it? */
1992 unsigned nopeel_inside_cost = 0;
1993 unsigned nopeel_outside_cost = 0;
1995 stmt_vector_for_cost dummy;
1996 dummy.create (2);
1997 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1998 &nopeel_outside_cost, &dummy, &dummy,
1999 0, false);
2000 dummy.release ();
2002 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2003 costs will be recorded. */
2004 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2005 prologue_cost_vec.create (2);
2006 epilogue_cost_vec.create (2);
2008 int dummy2;
2009 nopeel_outside_cost += vect_get_known_peeling_cost
2010 (loop_vinfo, 0, &dummy2,
2011 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2012 &prologue_cost_vec, &epilogue_cost_vec);
2014 prologue_cost_vec.release ();
2015 epilogue_cost_vec.release ();
2017 npeel = best_peel.peel_info.npeel;
2018 dr0 = best_peel.peel_info.dr;
2020 /* If not peeling at all is no more expensive than the best peeling we
2021 have found so far, don't perform any peeling. */
2022 if (nopeel_inside_cost <= best_peel.inside_cost)
2023 do_peeling = false;
2026 if (do_peeling)
2028 stmt = vect_dr_stmt (dr0);
2029 stmt_info = vinfo_for_stmt (stmt);
2030 vectype = STMT_VINFO_VECTYPE (stmt_info);
2032 if (known_alignment_for_access_p (dr0))
2034 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2035 size_zero_node) < 0;
2036 if (!npeel)
2038 /* Since it's known at compile time, compute the number of
2039 iterations in the peeled loop (the peeling factor) for use in
2040 updating DR_MISALIGNMENT values. The peeling factor is the
2041 vectorization factor minus the misalignment as an element
2042 count. */
2043 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2044 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2045 npeel = ((mis & (target_align - 1))
2046 / vect_get_scalar_dr_size (dr0));
2049 /* For interleaved data access every iteration accesses all the
2050 members of the group, therefore we divide the number of iterations
2051 by the group size. */
2052 stmt_info = vinfo_for_stmt (vect_dr_stmt (dr0));
2053 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2054 npeel /= DR_GROUP_SIZE (stmt_info);
2056 if (dump_enabled_p ())
2057 dump_printf_loc (MSG_NOTE, vect_location,
2058 "Try peeling by %d\n", npeel);
2061 /* Ensure that all datarefs can be vectorized after the peel. */
2062 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2063 do_peeling = false;
2065 /* Check if all datarefs are supportable and log. */
2066 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2068 stat = vect_verify_datarefs_alignment (loop_vinfo);
2069 if (!stat)
2070 do_peeling = false;
2071 else
2072 return stat;
2075 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2076 if (do_peeling)
2078 unsigned max_allowed_peel
2079 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2080 if (max_allowed_peel != (unsigned)-1)
2082 unsigned max_peel = npeel;
2083 if (max_peel == 0)
2085 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2086 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
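/* NPEEL being 0 at this point means the misalignment of DR0 is not known
   at compile time, so assume the worst case: e.g. with a 16-byte target
   alignment and 4-byte elements, up to 3 iterations may end up being
   peeled.  */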
2088 if (max_peel > max_allowed_peel)
2090 do_peeling = false;
2091 if (dump_enabled_p ())
2092 dump_printf_loc (MSG_NOTE, vect_location,
2093 "Disable peeling, max peels reached: %d\n", max_peel);
2098 /* Cost model #2 - if peeling may result in a remaining loop not
2099 iterating enough to be vectorized then do not peel. Since this
2100 is a cost heuristic rather than a correctness decision, use the
2101 most likely runtime value for variable vectorization factors. */
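/* For illustration, with an assumed VF of 4 and NPEEL == 3, a loop known
   to run fewer than 7 iterations is rejected here, since after peeling it
   would not be guaranteed even one full vector iteration.  */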
2102 if (do_peeling
2103 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2105 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2106 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2107 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2108 < assumed_vf + max_peel)
2109 do_peeling = false;
2112 if (do_peeling)
2114 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2115 If the misalignment of DR_i is identical to that of dr0 then set
2116 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2117 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2118 by the peeling factor times the element size of DR_i (MOD the
2119 vectorization factor times the size). Otherwise, the
2120 misalignment of DR_i must be set to unknown. */
2121 FOR_EACH_VEC_ELT (datarefs, i, dr)
2122 if (dr != dr0)
2124 /* Strided accesses perform only component accesses, alignment
2125 is irrelevant for them. */
2126 stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
2127 if (STMT_VINFO_STRIDED_P (stmt_info)
2128 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2129 continue;
2131 vect_update_misalignment_for_peel (dr, dr0, npeel);
2134 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2135 if (npeel)
2136 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2137 else
2138 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2139 = DR_MISALIGNMENT (dr0);
2140 SET_DR_MISALIGNMENT (dr0, 0);
2141 if (dump_enabled_p ())
2143 dump_printf_loc (MSG_NOTE, vect_location,
2144 "Alignment of access forced using peeling.\n");
2145 dump_printf_loc (MSG_NOTE, vect_location,
2146 "Peeling for alignment will be applied.\n");
2149 /* The inside-loop cost will be accounted for in vectorizable_load
2150 and vectorizable_store correctly with adjusted alignments.
2151 Drop the body_cst_vec on the floor here. */
2152 stat = vect_verify_datarefs_alignment (loop_vinfo);
2153 gcc_assert (stat);
2154 return stat;
2158 /* (2) Versioning to force alignment. */
2160 /* Try versioning if:
2161 1) optimize loop for speed
2162 2) there is at least one unsupported misaligned data ref with an unknown
2163 misalignment, and
2164 3) all misaligned data refs with a known misalignment are supported, and
2165 4) the number of runtime alignment checks is within reason. */
2167 do_versioning =
2168 optimize_loop_nest_for_speed_p (loop)
2169 && (!loop->inner); /* FORNOW */
2171 if (do_versioning)
2173 FOR_EACH_VEC_ELT (datarefs, i, dr)
2175 stmt = vect_dr_stmt (dr);
2176 stmt_info = vinfo_for_stmt (stmt);
2178 /* For interleaving, only the alignment of the first access
2179 matters. */
2180 if (aligned_access_p (dr)
2181 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2182 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2183 continue;
2185 if (STMT_VINFO_STRIDED_P (stmt_info))
2187 /* Strided loads perform only component accesses, alignment is
2188 irrelevant for them. */
2189 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2190 continue;
2191 do_versioning = false;
2192 break;
2195 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2197 if (!supportable_dr_alignment)
2199 gimple *stmt;
2200 int mask;
2201 tree vectype;
2203 if (known_alignment_for_access_p (dr)
2204 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2205 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2207 do_versioning = false;
2208 break;
2211 stmt = vect_dr_stmt (dr);
2212 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2213 gcc_assert (vectype);
2215 /* At present we don't support versioning for alignment
2216 with variable VF, since there's no guarantee that the
2217 VF is a power of two. We could relax this if we added
2218 a way of enforcing a power-of-two size. */
2219 unsigned HOST_WIDE_INT size;
2220 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2222 do_versioning = false;
2223 break;
2226 /* The rightmost bits of an aligned address must be zeros.
2227 Construct the mask needed for this test. For example,
2228 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2229 mask must be 15 = 0xf. */
2230 mask = size - 1;
2232 /* FORNOW: use the same mask to test all potentially unaligned
2233 references in the loop. The vectorizer currently supports
2234 a single vector size, see the reference to
2235 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2236 vectorization factor is computed. */
2237 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2238 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2239 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2240 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2241 vect_dr_stmt (dr));
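/* The versioning condition generated later effectively tests that
   ((addr_1 | ... | addr_n) & mask) == 0, i.e. that the address of each
   recorded reference has its low log2(vector-size) bits clear.  */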
2245 /* Versioning requires at least one misaligned data reference. */
2246 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2247 do_versioning = false;
2248 else if (!do_versioning)
2249 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2252 if (do_versioning)
2254 vec<gimple *> may_misalign_stmts
2255 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2256 gimple *stmt;
2258 /* It can now be assumed that the data references in the statements
2259 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2260 of the loop being vectorized. */
2261 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2263 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2264 dr = STMT_VINFO_DATA_REF (stmt_info);
2265 SET_DR_MISALIGNMENT (dr, 0);
2266 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_NOTE, vect_location,
2268 "Alignment of access forced using versioning.\n");
2271 if (dump_enabled_p ())
2272 dump_printf_loc (MSG_NOTE, vect_location,
2273 "Versioning for alignment will be applied.\n");
2275 /* Peeling and versioning can't be done together at this time. */
2276 gcc_assert (! (do_peeling && do_versioning));
2278 stat = vect_verify_datarefs_alignment (loop_vinfo);
2279 gcc_assert (stat);
2280 return stat;
2283 /* This point is reached if neither peeling nor versioning is being done. */
2284 gcc_assert (! (do_peeling || do_versioning));
2286 stat = vect_verify_datarefs_alignment (loop_vinfo);
2287 return stat;
2291 /* Function vect_find_same_alignment_drs.
2293 Update group and alignment relations according to the chosen
2294 vectorization factor. */
2296 static void
2297 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2299 struct data_reference *dra = DDR_A (ddr);
2300 struct data_reference *drb = DDR_B (ddr);
2301 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
2302 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
2304 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2305 return;
2307 if (dra == drb)
2308 return;
2310 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2311 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2312 return;
2314 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2315 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2316 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2317 return;
2319 /* Two references with distance zero have the same alignment. */
2320 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2321 - wi::to_poly_offset (DR_INIT (drb)));
2322 if (maybe_ne (diff, 0))
2324 /* Get the wider of the two alignments. */
2325 unsigned int align_a = (vect_calculate_target_alignment (dra)
2326 / BITS_PER_UNIT);
2327 unsigned int align_b = (vect_calculate_target_alignment (drb)
2328 / BITS_PER_UNIT);
2329 unsigned int max_align = MAX (align_a, align_b);
2331 /* Require the gap to be a multiple of the larger vector alignment. */
2332 if (!multiple_p (diff, max_align))
2333 return;
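/* For illustration: two 4-byte accesses whose DR_INITs differ by 16 bytes
   have the same misalignment with respect to a 16-byte vector alignment,
   so each is recorded in the other's SAME_ALIGN_REFS list below.  */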
2336 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2337 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2338 if (dump_enabled_p ())
2340 dump_printf_loc (MSG_NOTE, vect_location,
2341 "accesses have the same alignment: ");
2342 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2343 dump_printf (MSG_NOTE, " and ");
2344 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2345 dump_printf (MSG_NOTE, "\n");
2350 /* Function vect_analyze_data_refs_alignment
2352 Analyze the alignment of the data-references in the loop.
2353 Return FALSE if a data reference is found that cannot be vectorized. */
2355 bool
2356 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2358 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2360 /* Mark groups of data references with same alignment using
2361 data dependence information. */
2362 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2363 struct data_dependence_relation *ddr;
2364 unsigned int i;
2366 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2367 vect_find_same_alignment_drs (ddr);
2369 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2370 struct data_reference *dr;
2372 vect_record_base_alignments (vinfo);
2373 FOR_EACH_VEC_ELT (datarefs, i, dr)
2375 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
2376 if (STMT_VINFO_VECTORIZABLE (stmt_info))
2377 vect_compute_data_ref_alignment (dr);
2380 return true;
2384 /* Analyze alignment of DRs of stmts in NODE. */
2386 static bool
2387 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2389 /* We vectorize from the first scalar stmt in the node unless
2390 the node is permuted in which case we start from the first
2391 element in the group. */
2392 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2393 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2394 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2395 first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2397 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2398 vect_compute_data_ref_alignment (dr);
2399 /* For creating the data-ref pointer we need alignment of the
2400 first element anyway. */
2401 if (dr != first_dr)
2402 vect_compute_data_ref_alignment (first_dr);
2403 if (! verify_data_ref_alignment (dr))
2405 if (dump_enabled_p ())
2406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2407 "not vectorized: bad data alignment in basic "
2408 "block.\n");
2409 return false;
2412 return true;
2415 /* Function vect_slp_analyze_and_verify_instance_alignment
2417 Analyze the alignment of the data-references in the SLP instance.
2418 Return FALSE if a data reference is found that cannot be vectorized. */
2420 bool
2421 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2423 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2425 slp_tree node;
2426 unsigned i;
2427 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2428 if (! vect_slp_analyze_and_verify_node_alignment (node))
2429 return false;
2431 node = SLP_INSTANCE_TREE (instance);
2432 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2433 && ! vect_slp_analyze_and_verify_node_alignment
2434 (SLP_INSTANCE_TREE (instance)))
2435 return false;
2437 return true;
2441 /* Analyze groups of accesses: check that DR belongs to a group of
2442 accesses of legal size, step, etc. Detect gaps, single element
2443 interleaving, and other special cases. Set grouped access info.
2444 Collect groups of strided stores for further use in SLP analysis.
2445 Worker for vect_analyze_group_access. */
2447 static bool
2448 vect_analyze_group_access_1 (struct data_reference *dr)
2450 tree step = DR_STEP (dr);
2451 tree scalar_type = TREE_TYPE (DR_REF (dr));
2452 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2453 gimple *stmt = vect_dr_stmt (dr);
2454 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2455 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2456 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2457 HOST_WIDE_INT dr_step = -1;
2458 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2459 bool slp_impossible = false;
2461 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2462 size of the interleaving group (including gaps). */
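/* For example, loads of a[4*i], a[4*i+1] and a[4*i+2] with 4-byte elements
   have DR_STEP 16, so GROUPSIZE is 4 and the group ends with a gap of one
   element.  */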
2463 if (tree_fits_shwi_p (step))
2465 dr_step = tree_to_shwi (step);
2466 /* Check that STEP is a multiple of type size. Otherwise there is
2467 a non-element-sized gap at the end of the group which we
2468 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2469 ??? As we can handle non-constant step fine here we should
2470 simply remove uses of DR_GROUP_GAP between the last and first
2471 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2472 simply not include that gap. */
2473 if ((dr_step % type_size) != 0)
2475 if (dump_enabled_p ())
2477 dump_printf_loc (MSG_NOTE, vect_location,
2478 "Step ");
2479 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2480 dump_printf (MSG_NOTE,
2481 " is not a multiple of the element size for ");
2482 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2483 dump_printf (MSG_NOTE, "\n");
2485 return false;
2487 groupsize = absu_hwi (dr_step) / type_size;
2489 else
2490 groupsize = 0;
2492 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2493 if (!DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2495 /* Check whether this DR is part of an interleaving group of which it is
2496 the single element accessed in the loop. */
2498 /* Gaps are supported only for loads. STEP must be a multiple of the type
2499 size. */
2500 if (DR_IS_READ (dr)
2501 && (dr_step % type_size) == 0
2502 && groupsize > 0)
2504 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2505 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2506 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2507 if (dump_enabled_p ())
2509 dump_printf_loc (MSG_NOTE, vect_location,
2510 "Detected single element interleaving ");
2511 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2512 dump_printf (MSG_NOTE, " step ");
2513 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2514 dump_printf (MSG_NOTE, "\n");
2517 return true;
2520 if (dump_enabled_p ())
2522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2523 "not consecutive access ");
2524 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2527 if (bb_vinfo)
2529 /* Mark the statement as unvectorizable. */
2530 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;
2531 return true;
2534 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2535 STMT_VINFO_STRIDED_P (stmt_info) = true;
2536 return true;
2539 if (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2541 /* First stmt in the interleaving chain. Check the chain. */
2542 gimple *next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2543 struct data_reference *data_ref = dr;
2544 unsigned int count = 1;
2545 tree prev_init = DR_INIT (data_ref);
2546 gimple *prev = stmt;
2547 HOST_WIDE_INT diff, gaps = 0;
2549 /* By construction, all group members have INTEGER_CST DR_INITs. */
2550 while (next)
2552 /* Skip same data-refs. In case two or more stmts share a
2553 data-ref (supported only for loads), we vectorize only the first
2554 stmt, and the rest get their vectorized loads from the first
2555 one. */
2556 if (!tree_int_cst_compare (DR_INIT (data_ref),
2557 DR_INIT (STMT_VINFO_DATA_REF (
2558 vinfo_for_stmt (next)))))
2560 if (DR_IS_WRITE (data_ref))
2562 if (dump_enabled_p ())
2563 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2564 "Two store stmts share the same dr.\n");
2565 return false;
2568 if (dump_enabled_p ())
2569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2570 "Two or more load stmts share the same dr.\n");
2572 /* For load use the same data-ref load. */
2573 DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2575 prev = next;
2576 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2577 continue;
2580 prev = next;
2581 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2583 /* All group members have the same STEP by construction. */
2584 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2586 /* Check that the distance between two accesses is equal to the type
2587 size. Otherwise, we have gaps. */
2588 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2589 - TREE_INT_CST_LOW (prev_init)) / type_size;
2590 if (diff != 1)
2592 /* FORNOW: SLP of accesses with gaps is not supported. */
2593 slp_impossible = true;
2594 if (DR_IS_WRITE (data_ref))
2596 if (dump_enabled_p ())
2597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2598 "interleaved store with gaps\n");
2599 return false;
2602 gaps += diff - 1;
2605 last_accessed_element += diff;
2607 /* Store the gap from the previous member of the group. If there is no
2608 gap in the access, DR_GROUP_GAP is always 1. */
2609 DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
2611 prev_init = DR_INIT (data_ref);
2612 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2613 /* Count the number of data-refs in the chain. */
2614 count++;
2617 if (groupsize == 0)
2618 groupsize = count + gaps;
2620 /* This could be UINT_MAX but as we are generating code in a very
2621 inefficient way we have to cap earlier. See PR78699 for example. */
2622 if (groupsize > 4096)
2624 if (dump_enabled_p ())
2625 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2626 "group is too large\n");
2627 return false;
2630 /* Check that the size of the interleaving is equal to count for stores,
2631 i.e., that there are no gaps. */
2632 if (groupsize != count
2633 && !DR_IS_READ (dr))
2635 if (dump_enabled_p ())
2636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2637 "interleaved store with gaps\n");
2638 return false;
2641 /* If there is a gap after the last load in the group it is the
2642 difference between the groupsize and the last accessed
2643 element.
2644 When there is no gap, this difference should be 0. */
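/* E.g., for a group of loads a[4*i], a[4*i+1] and a[4*i+2] of 4-byte
   elements (GROUPSIZE 4), the last accessed element is 3 and the gap
   recorded for the first stmt is 1.  */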
2645 DR_GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2647 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2648 if (dump_enabled_p ())
2650 dump_printf_loc (MSG_NOTE, vect_location,
2651 "Detected interleaving ");
2652 if (DR_IS_READ (dr))
2653 dump_printf (MSG_NOTE, "load ");
2654 else
2655 dump_printf (MSG_NOTE, "store ");
2656 dump_printf (MSG_NOTE, "of size %u starting with ",
2657 (unsigned)groupsize);
2658 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2659 if (DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2660 dump_printf_loc (MSG_NOTE, vect_location,
2661 "There is a gap of %u elements after the group\n",
2662 DR_GROUP_GAP (vinfo_for_stmt (stmt)));
2665 /* SLP: create an SLP data structure for every interleaving group of
2666 stores for further analysis in vect_analyze_slp. */
2667 if (DR_IS_WRITE (dr) && !slp_impossible)
2669 if (loop_vinfo)
2670 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2671 if (bb_vinfo)
2672 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2676 return true;
2679 /* Analyze groups of accesses: check that DR belongs to a group of
2680 accesses of legal size, step, etc. Detect gaps, single element
2681 interleaving, and other special cases. Set grouped access info.
2682 Collect groups of strided stores for further use in SLP analysis. */
2684 static bool
2685 vect_analyze_group_access (struct data_reference *dr)
2687 if (!vect_analyze_group_access_1 (dr))
2689 /* Dissolve the group if present. */
2690 gimple *next;
2691 gimple *stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dr)));
2692 while (stmt)
2694 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2695 next = DR_GROUP_NEXT_ELEMENT (vinfo);
2696 DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2697 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2698 stmt = next;
2700 return false;
2702 return true;
2705 /* Analyze the access pattern of the data-reference DR.
2706 In case of non-consecutive accesses call vect_analyze_group_access() to
2707 analyze groups of accesses. */
2709 static bool
2710 vect_analyze_data_ref_access (struct data_reference *dr)
2712 tree step = DR_STEP (dr);
2713 tree scalar_type = TREE_TYPE (DR_REF (dr));
2714 gimple *stmt = vect_dr_stmt (dr);
2715 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2716 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2717 struct loop *loop = NULL;
2719 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2720 return true;
2722 if (loop_vinfo)
2723 loop = LOOP_VINFO_LOOP (loop_vinfo);
2725 if (loop_vinfo && !step)
2727 if (dump_enabled_p ())
2728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2729 "bad data-ref access in loop\n");
2730 return false;
2733 /* Allow loads with zero step in inner-loop vectorization. */
2734 if (loop_vinfo && integer_zerop (step))
2736 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2737 if (!nested_in_vect_loop_p (loop, stmt))
2738 return DR_IS_READ (dr);
2739 /* Allow references with zero step for outer loops marked
2740 with pragma omp simd only - it guarantees absence of
2741 loop-carried dependencies between inner loop iterations. */
2742 if (loop->safelen < 2)
2744 if (dump_enabled_p ())
2745 dump_printf_loc (MSG_NOTE, vect_location,
2746 "zero step in inner loop of nest\n");
2747 return false;
2751 if (loop && nested_in_vect_loop_p (loop, stmt))
2753 /* Interleaved accesses are not yet supported within outer-loop
2754 vectorization for references in the inner-loop. */
2755 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2757 /* For the rest of the analysis we use the outer-loop step. */
2758 step = STMT_VINFO_DR_STEP (stmt_info);
2759 if (integer_zerop (step))
2761 if (dump_enabled_p ())
2762 dump_printf_loc (MSG_NOTE, vect_location,
2763 "zero step in outer loop.\n");
2764 return DR_IS_READ (dr);
2768 /* Consecutive? */
2769 if (TREE_CODE (step) == INTEGER_CST)
2771 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2772 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2773 || (dr_step < 0
2774 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2776 /* Mark that it is not interleaving. */
2777 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2778 return true;
2782 if (loop && nested_in_vect_loop_p (loop, stmt))
2784 if (dump_enabled_p ())
2785 dump_printf_loc (MSG_NOTE, vect_location,
2786 "grouped access in outer loop.\n");
2787 return false;
2791 /* Assume this is a DR handled by non-constant strided load case. */
2792 if (TREE_CODE (step) != INTEGER_CST)
2793 return (STMT_VINFO_STRIDED_P (stmt_info)
2794 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2795 || vect_analyze_group_access (dr)));
2797 /* Not a consecutive access - check whether it is part of an interleaving group. */
2798 return vect_analyze_group_access (dr);
2801 /* Compare two data-references DRA and DRB in order to sort them into
2802 chunks suitable for grouping. */
2804 static int
2805 dr_group_sort_cmp (const void *dra_, const void *drb_)
2807 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2808 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2809 int cmp;
2811 /* Stabilize sort. */
2812 if (dra == drb)
2813 return 0;
2815 /* DRs in different loops never belong to the same group. */
2816 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2817 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2818 if (loopa != loopb)
2819 return loopa->num < loopb->num ? -1 : 1;
2821 /* Ordering of DRs according to base. */
2822 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2823 DR_BASE_ADDRESS (drb));
2824 if (cmp != 0)
2825 return cmp;
2827 /* And according to DR_OFFSET. */
2828 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2829 if (cmp != 0)
2830 return cmp;
2832 /* Put reads before writes. */
2833 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2834 return DR_IS_READ (dra) ? -1 : 1;
2836 /* Then sort after access size. */
2837 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2838 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2839 if (cmp != 0)
2840 return cmp;
2842 /* And after step. */
2843 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2844 if (cmp != 0)
2845 return cmp;
2847 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2848 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2849 if (cmp == 0)
2850 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2851 return cmp;
2854 /* If OP is the result of a conversion, return the unconverted value,
2855 otherwise return null. */
2857 static tree
2858 strip_conversion (tree op)
2860 if (TREE_CODE (op) != SSA_NAME)
2861 return NULL_TREE;
2862 gimple *stmt = SSA_NAME_DEF_STMT (op);
2863 if (!is_gimple_assign (stmt)
2864 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2865 return NULL_TREE;
2866 return gimple_assign_rhs1 (stmt);
2869 /* Return true if vectorizable_* routines can handle statements STMT1
2870 and STMT2 being in a single group. */
2872 static bool
2873 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2875 if (gimple_assign_single_p (stmt1))
2876 return gimple_assign_single_p (stmt2);
2878 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2880 /* Check for two masked loads or two masked stores. */
2881 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2882 return false;
2883 internal_fn ifn = gimple_call_internal_fn (stmt1);
2884 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2885 return false;
2886 if (ifn != gimple_call_internal_fn (stmt2))
2887 return false;
2889 /* Check that the masks are the same. Cope with casts of masks,
2890 like those created by build_mask_conversion. */
2891 tree mask1 = gimple_call_arg (stmt1, 2);
2892 tree mask2 = gimple_call_arg (stmt2, 2);
2893 if (!operand_equal_p (mask1, mask2, 0))
2895 mask1 = strip_conversion (mask1);
2896 if (!mask1)
2897 return false;
2898 mask2 = strip_conversion (mask2);
2899 if (!mask2)
2900 return false;
2901 if (!operand_equal_p (mask1, mask2, 0))
2902 return false;
2904 return true;
2907 return false;
2910 /* Function vect_analyze_data_ref_accesses.
2912 Analyze the access pattern of all the data references in the loop.
2914 FORNOW: the only access pattern that is considered vectorizable is a
2915 simple step 1 (consecutive) access.
2917 FORNOW: handle only arrays and pointer accesses. */
2919 bool
2920 vect_analyze_data_ref_accesses (vec_info *vinfo)
2922 unsigned int i;
2923 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2924 struct data_reference *dr;
2926 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2928 if (datarefs.is_empty ())
2929 return true;
2931 /* Sort the array of datarefs to make building the interleaving chains
2932 linear. Don't modify the original vector's order; it is needed for
2933 determining what dependencies are reversed. */
2934 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2935 datarefs_copy.qsort (dr_group_sort_cmp);
2937 /* Build the interleaving chains. */
2938 for (i = 0; i < datarefs_copy.length () - 1;)
2940 data_reference_p dra = datarefs_copy[i];
2941 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
2942 stmt_vec_info lastinfo = NULL;
2943 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2944 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2946 ++i;
2947 continue;
2949 for (i = i + 1; i < datarefs_copy.length (); ++i)
2951 data_reference_p drb = datarefs_copy[i];
2952 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
2953 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2954 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2955 break;
2957 /* ??? Imperfect sorting (non-compatible types, non-modulo
2958 accesses, same accesses) can lead to a group being artificially
2959 split here as we don't just skip over those. If it really
2960 matters we can push those to a worklist and re-iterate
2961 over them. Then we can just skip ahead to the next DR here. */
2963 /* DRs in a different loop should not be put into the same
2964 interleaving group. */
2965 if (gimple_bb (DR_STMT (dra))->loop_father
2966 != gimple_bb (DR_STMT (drb))->loop_father)
2967 break;
2969 /* Check that the data-refs have same first location (except init)
2970 and they are both either store or load (not load and store,
2971 not masked loads or stores). */
2972 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2973 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2974 DR_BASE_ADDRESS (drb)) != 0
2975 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2976 || !can_group_stmts_p (vect_dr_stmt (dra), vect_dr_stmt (drb)))
2977 break;
2979 /* Check that the data-refs have the same constant size. */
2980 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2981 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2982 if (!tree_fits_uhwi_p (sza)
2983 || !tree_fits_uhwi_p (szb)
2984 || !tree_int_cst_equal (sza, szb))
2985 break;
2987 /* Check that the data-refs have the same step. */
2988 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2989 break;
2991 /* Check the types are compatible.
2992 ??? We don't distinguish this during sorting. */
2993 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2994 TREE_TYPE (DR_REF (drb))))
2995 break;
2997 /* Check that the DR_INITs are compile-time constants. */
2998 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2999 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3000 break;
3002 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3003 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3004 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3005 HOST_WIDE_INT init_prev
3006 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3007 gcc_assert (init_a <= init_b
3008 && init_a <= init_prev
3009 && init_prev <= init_b);
3011 /* Do not place the same access in the interleaving chain twice. */
3012 if (init_b == init_prev)
3014 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3015 < gimple_uid (DR_STMT (drb)));
3016 /* ??? For now we simply "drop" the later reference which is
3017 otherwise the same rather than finishing off this group.
3018 In the end we'd want to re-process duplicates forming
3019 multiple groups from the refs, likely by just collecting
3020 all candidates (including duplicates and split points
3021 below) in a vector and then process them together. */
3022 continue;
3025 /* If init_b == init_a + the size of the type * k, we have an
3026 interleaving, and DRA is accessed before DRB. */
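/* For illustration: with 4-byte elements, inits 0 and 8 can form an
   interleaving group (k == 2), while inits 0 and 6 cannot, because 6 is
   not a multiple of the element size.  */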
3027 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3028 if (type_size_a == 0
3029 || (init_b - init_a) % type_size_a != 0)
3030 break;
3032 /* If we have a store, the accesses are adjacent. This splits
3033 groups into chunks we support (we don't support vectorization
3034 of stores with gaps). */
3035 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3036 break;
3038 /* If the step (when it is constant and non-zero) is greater than the
3039 difference between the data-refs' inits, this splits groups into
3040 suitable sizes. */
3041 if (tree_fits_shwi_p (DR_STEP (dra)))
3043 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3044 if (step != 0 && step <= (init_b - init_a))
3045 break;
3048 if (dump_enabled_p ())
3050 dump_printf_loc (MSG_NOTE, vect_location,
3051 "Detected interleaving ");
3052 if (DR_IS_READ (dra))
3053 dump_printf (MSG_NOTE, "load ");
3054 else
3055 dump_printf (MSG_NOTE, "store ");
3056 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3057 dump_printf (MSG_NOTE, " and ");
3058 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3059 dump_printf (MSG_NOTE, "\n");
3062 /* Link the found element into the group list. */
3063 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3065 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = vect_dr_stmt (dra);
3066 lastinfo = stmtinfo_a;
3068 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = vect_dr_stmt (dra);
3069 DR_GROUP_NEXT_ELEMENT (lastinfo) = vect_dr_stmt (drb);
3070 lastinfo = stmtinfo_b;
3074 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3075 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr)))
3076 && !vect_analyze_data_ref_access (dr))
3078 if (dump_enabled_p ())
3079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3080 "not vectorized: complicated access pattern.\n");
3082 if (is_a <bb_vec_info> (vinfo))
3084 /* Mark the statement as not vectorizable. */
3085 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;
3086 continue;
3088 else
3090 datarefs_copy.release ();
3091 return false;
3095 datarefs_copy.release ();
3096 return true;
3099 /* Function vect_vfa_segment_size.
3101 Input:
3102 DR: The data reference.
3103 LENGTH_FACTOR: segment length to consider.
3105 Return a value suitable for the dr_with_seg_len::seg_len field.
3106 This is the "distance travelled" by the pointer from the first
3107 iteration in the segment to the last. Note that it does not include
3108 the size of the access; in effect it only describes the first byte. */
3110 static tree
3111 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3113 length_factor = size_binop (MINUS_EXPR,
3114 fold_convert (sizetype, length_factor),
3115 size_one_node);
3116 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3117 length_factor);
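/* For illustration: with DR_STEP 4 and LENGTH_FACTOR equal to a
   vectorization factor of 8, the segment length is 4 * (8 - 1) == 28 bytes;
   the bytes of the access itself are accounted for separately by
   vect_vfa_access_size below.  */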
3120 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3121 gives the worst-case number of bytes covered by the segment. */
3123 static unsigned HOST_WIDE_INT
3124 vect_vfa_access_size (data_reference *dr)
3126 stmt_vec_info stmt_vinfo = vinfo_for_stmt (vect_dr_stmt (dr));
3127 tree ref_type = TREE_TYPE (DR_REF (dr));
3128 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3129 unsigned HOST_WIDE_INT access_size = ref_size;
3130 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3132 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == vect_dr_stmt (dr));
3133 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3135 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3136 && (vect_supportable_dr_alignment (dr, false)
3137 == dr_explicit_realign_optimized))
3139 /* We might access a full vector's worth. */
3140 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3141 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3143 return access_size;
3146 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3148 static unsigned int
3149 vect_vfa_align (const data_reference *dr)
3151 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3154 /* Function vect_compile_time_alias.
3156 Given data references A and B with equal base and offset, see whether
3157 the alias relation can be decided at compilation time. Return 1 if
3158 it can and the references alias, 0 if it can and the references do
3159 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3160 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3161 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3163 static int
3164 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3165 tree segment_length_a, tree segment_length_b,
3166 unsigned HOST_WIDE_INT access_size_a,
3167 unsigned HOST_WIDE_INT access_size_b)
3169 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3170 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3171 poly_uint64 const_length_a;
3172 poly_uint64 const_length_b;
3174 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3175 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3176 [a, a+12) */
3177 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3179 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3180 offset_a = (offset_a + access_size_a) - const_length_a;
3182 else
3183 const_length_a = tree_to_poly_uint64 (segment_length_a);
3184 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3186 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3187 offset_b = (offset_b + access_size_b) - const_length_b;
3189 else
3190 const_length_b = tree_to_poly_uint64 (segment_length_b);
3192 const_length_a += access_size_a;
3193 const_length_b += access_size_b;
3195 if (ranges_known_overlap_p (offset_a, const_length_a,
3196 offset_b, const_length_b))
3197 return 1;
3199 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3200 offset_b, const_length_b))
3201 return 0;
3203 return -1;
3206 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3207 in DDR is >= VF. */
3209 static bool
3210 dependence_distance_ge_vf (data_dependence_relation *ddr,
3211 unsigned int loop_depth, poly_uint64 vf)
3213 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3214 || DDR_NUM_DIST_VECTS (ddr) == 0)
3215 return false;
3217 /* If the dependence is exact, we should have limited the VF instead. */
3218 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3220 unsigned int i;
3221 lambda_vector dist_v;
3222 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3224 HOST_WIDE_INT dist = dist_v[loop_depth];
3225 if (dist != 0
3226 && !(dist > 0 && DDR_REVERSED_P (ddr))
3227 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3228 return false;
3231 if (dump_enabled_p ())
3233 dump_printf_loc (MSG_NOTE, vect_location,
3234 "dependence distance between ");
3235 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3236 dump_printf (MSG_NOTE, " and ");
3237 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3238 dump_printf (MSG_NOTE, " is >= VF\n");
3241 return true;
3244 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3246 static void
3247 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3249 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3250 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3251 dump_printf (dump_kind, ") >= ");
3252 dump_dec (dump_kind, lower_bound.min_value);
3255 /* Record that the vectorized loop requires the vec_lower_bound described
3256 by EXPR, UNSIGNED_P and MIN_VALUE. */
3258 static void
3259 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3260 poly_uint64 min_value)
3262 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3263 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3264 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3266 unsigned_p &= lower_bounds[i].unsigned_p;
3267 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3268 if (lower_bounds[i].unsigned_p != unsigned_p
3269 || maybe_lt (lower_bounds[i].min_value, min_value))
3271 lower_bounds[i].unsigned_p = unsigned_p;
3272 lower_bounds[i].min_value = min_value;
3273 if (dump_enabled_p ())
3275 dump_printf_loc (MSG_NOTE, vect_location,
3276 "updating run-time check to ");
3277 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3278 dump_printf (MSG_NOTE, "\n");
3281 return;
3284 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3285 if (dump_enabled_p ())
3287 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3288 dump_lower_bound (MSG_NOTE, lower_bound);
3289 dump_printf (MSG_NOTE, "\n");
3291 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3294 /* Return true if it's unlikely that the step of the vectorized form of DR
3295 will span fewer than GAP bytes. */
3297 static bool
3298 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3300 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
3301 HOST_WIDE_INT count
3302 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3303 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3304 count *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
3305 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
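/* For illustration: with an estimated VF of 4 and 4-byte scalar accesses,
   the vectorized step is expected to cover 16 bytes, so any GAP of at most
   16 bytes counts as small here.  */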
3308 /* Return true if we know that there is no alias between DR_A and DR_B
3309 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3310 *LOWER_BOUND_OUT to this N. */
3312 static bool
3313 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3314 poly_uint64 *lower_bound_out)
3316 /* Check that there is a constant gap of known sign between DR_A
3317 and DR_B. */
3318 poly_int64 init_a, init_b;
3319 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3320 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3321 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3322 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3323 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3324 || !ordered_p (init_a, init_b))
3325 return false;
3327 /* Sort DR_A and DR_B by the address they access. */
3328 if (maybe_lt (init_b, init_a))
3330 std::swap (init_a, init_b);
3331 std::swap (dr_a, dr_b);
3334 /* If the two accesses could be dependent within a scalar iteration,
3335 make sure that we'd retain their order. */
3336 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3337 && !vect_preserves_scalar_order_p (vect_dr_stmt (dr_a),
3338 vect_dr_stmt (dr_b)))
3339 return false;
3341 /* There is no alias if abs (DR_STEP) is greater than or equal to
3342 the bytes spanned by the combination of the two accesses. */
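/* For illustration: two 4-byte accesses whose DR_INITs differ by 4 bytes
   get a lower bound of 8; once abs (DR_STEP) >= 8, the per-iteration
   ranges of the two accesses cannot overlap.  */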
3343 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3344 return true;
3347 /* Function vect_prune_runtime_alias_test_list.
3349 Prune a list of ddrs to be tested at run-time by versioning for alias.
3350 Merge several alias checks into one if possible.
3351 Return FALSE if the resulting list of ddrs is longer than allowed by
3352 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3354 bool
3355 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3357 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3358 hash_set <tree_pair_hash> compared_objects;
3360 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3361 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3362 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3363 vec<vec_object_pair> &check_unequal_addrs
3364 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3365 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3366 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3368 ddr_p ddr;
3369 unsigned int i;
3370 tree length_factor;
3372 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3374 /* Step values are irrelevant for aliasing if the number of vector
3375 iterations is equal to the number of scalar iterations (which can
3376 happen for fully-SLP loops). */
3377 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3379 if (!ignore_step_p)
3381 /* Convert the checks for nonzero steps into bound tests. */
3382 tree value;
3383 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3384 vect_check_lower_bound (loop_vinfo, value, true, 1);
3387 if (may_alias_ddrs.is_empty ())
3388 return true;
3390 comp_alias_ddrs.create (may_alias_ddrs.length ());
3392 unsigned int loop_depth
3393 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3394 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3396 /* First, we collect all data ref pairs for aliasing checks. */
3397 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3399 int comp_res;
3400 poly_uint64 lower_bound;
3401 struct data_reference *dr_a, *dr_b;
3402 gimple *dr_group_first_a, *dr_group_first_b;
3403 tree segment_length_a, segment_length_b;
3404 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3405 unsigned int align_a, align_b;
3406 gimple *stmt_a, *stmt_b;
3408 /* Ignore the alias if the VF we chose ended up being no greater
3409 than the dependence distance. */
3410 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3411 continue;
3413 if (DDR_OBJECT_A (ddr))
3415 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3416 if (!compared_objects.add (new_pair))
3418 if (dump_enabled_p ())
3420 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3421 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3422 dump_printf (MSG_NOTE, " and ");
3423 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3424 dump_printf (MSG_NOTE, " have different addresses\n");
3426 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3428 continue;
3431 dr_a = DDR_A (ddr);
3432 stmt_a = vect_dr_stmt (DDR_A (ddr));
3434 dr_b = DDR_B (ddr);
3435 stmt_b = vect_dr_stmt (DDR_B (ddr));
3437 /* Skip the pair if inter-iteration dependencies are irrelevant
3438 and intra-iteration dependencies are guaranteed to be honored. */
3439 if (ignore_step_p
3440 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3441 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3443 if (dump_enabled_p ())
3445 dump_printf_loc (MSG_NOTE, vect_location,
3446 "no need for alias check between ");
3447 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3448 dump_printf (MSG_NOTE, " and ");
3449 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3450 dump_printf (MSG_NOTE, " when VF is 1\n");
3452 continue;
3455 /* See whether we can handle the alias using a bounds check on
3456 the step, and whether that's likely to be the best approach.
3457 (It might not be, for example, if the minimum step is much larger
3458 than the number of bytes handled by one vector iteration.) */
3459 if (!ignore_step_p
3460 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3461 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3462 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3463 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3465 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3466 if (dump_enabled_p ())
3468 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3469 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3470 dump_printf (MSG_NOTE, " and ");
3471 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3472 dump_printf (MSG_NOTE, " when the step ");
3473 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3474 dump_printf (MSG_NOTE, " is outside ");
3475 if (unsigned_p)
3476 dump_printf (MSG_NOTE, "[0");
3477 else
3479 dump_printf (MSG_NOTE, "(");
3480 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3482 dump_printf (MSG_NOTE, ", ");
3483 dump_dec (MSG_NOTE, lower_bound);
3484 dump_printf (MSG_NOTE, ")\n");
3486 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3487 lower_bound);
3488 continue;
3491 dr_group_first_a = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3492 if (dr_group_first_a)
3494 stmt_a = dr_group_first_a;
3495 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3498 dr_group_first_b = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3499 if (dr_group_first_b)
3501 stmt_b = dr_group_first_b;
3502 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3505 if (ignore_step_p)
3507 segment_length_a = size_zero_node;
3508 segment_length_b = size_zero_node;
3510 else
3512 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3513 length_factor = scalar_loop_iters;
3514 else
3515 length_factor = size_int (vect_factor);
3516 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3517 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3519 access_size_a = vect_vfa_access_size (dr_a);
3520 access_size_b = vect_vfa_access_size (dr_b);
3521 align_a = vect_vfa_align (dr_a);
3522 align_b = vect_vfa_align (dr_b);
3524 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3525 DR_BASE_ADDRESS (dr_b));
3526 if (comp_res == 0)
3527 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3528 DR_OFFSET (dr_b));
3530 /* See whether the alias is known at compilation time. */
3531 if (comp_res == 0
3532 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3533 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3534 && poly_int_tree_p (segment_length_a)
3535 && poly_int_tree_p (segment_length_b))
3537 int res = vect_compile_time_alias (dr_a, dr_b,
3538 segment_length_a,
3539 segment_length_b,
3540 access_size_a,
3541 access_size_b);
3542 if (res >= 0 && dump_enabled_p ())
3544 dump_printf_loc (MSG_NOTE, vect_location,
3545 "can tell at compile time that ");
3546 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3547 dump_printf (MSG_NOTE, " and ");
3548 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3549 if (res == 0)
3550 dump_printf (MSG_NOTE, " do not alias\n");
3551 else
3552 dump_printf (MSG_NOTE, " alias\n");
3555 if (res == 0)
3556 continue;
3558 if (res == 1)
3560 if (dump_enabled_p ())
3561 dump_printf_loc (MSG_NOTE, vect_location,
3562 "not vectorized: compilation time alias.\n");
3563 return false;
3567 dr_with_seg_len_pair_t dr_with_seg_len_pair
3568 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3569 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3571 /* Canonicalize pairs by sorting the two DR members. */
3572 if (comp_res > 0)
3573 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3575 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3578 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3580 unsigned int count = (comp_alias_ddrs.length ()
3581 + check_unequal_addrs.length ());
3583 dump_printf_loc (MSG_NOTE, vect_location,
3584 "improved number of alias checks from %d to %d\n",
3585 may_alias_ddrs.length (), count);
3586 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3588 if (dump_enabled_p ())
3589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3590 "number of versioning for alias "
3591 "run-time tests exceeds %d "
3592 "(--param vect-max-version-for-alias-checks)\n",
3593 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3594 return false;
3597 return true;
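/* As an illustrative sketch (the array names below are hypothetical, and the
   real test also folds in the access sizes and alignments recorded above),
   each pair that survives the pruning above is later turned into a runtime
   segment-overlap test.  For a loop such as

     for (i = 0; i < n; i++)
       a[i] = b[i] + 1;

   where A and B may alias, the versioned form is roughly

     if (a + segment_a <= b || b + segment_b <= a)
       ... vectorized loop ...
     else
       ... scalar fallback loop ...  */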
3600 /* Check whether we can use an internal function for a gather load
3601 or scatter store. READ_P is true for loads and false for stores.
3602 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3603 the type of the memory elements being loaded or stored. OFFSET_BITS
3604 is the number of bits in each scalar offset and OFFSET_SIGN is the
3605 sign of the offset. SCALE is the amount by which the offset should
3606 be multiplied *after* it has been converted to address width.
3608 Return true if the function is supported, storing the function
3609 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3611 bool
3612 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3613 tree memory_type, unsigned int offset_bits,
3614 signop offset_sign, int scale,
3615 internal_fn *ifn_out, tree *element_type_out)
3617 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3618 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3619 if (offset_bits > element_bits)
3620 /* Internal functions require the offset to be the same width as
3621 the vector elements. We can extend narrower offsets, but it isn't
3622 safe to truncate wider offsets. */
3623 return false;
3625 if (element_bits != memory_bits)
3626 /* For now the vector elements must be the same width as the
3627 memory elements. */
3628 return false;
3630 /* Work out which function we need. */
3631 internal_fn ifn;
3632 if (read_p)
3633 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3634 else
3635 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3637 /* Test whether the target supports this combination. */
3638 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3639 offset_sign, scale))
3640 return false;
3642 *ifn_out = ifn;
3643 *element_type_out = TREE_TYPE (vectype);
3644 return true;
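/* For illustration only (DST, SRC and IDX are hypothetical names): in a
   loop such as

     for (i = 0; i < n; i++)
       dst[i] = src[idx[i]];

   with 32-bit elements and 32-bit offsets, this predicate asks whether the
   target implements IFN_GATHER_LOAD (or IFN_MASK_GATHER_LOAD when the access
   is conditional) for that vector type, memory type, offset sign and scale;
   the scatter-store case is the mirror image for stores.  */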
3647 /* CALL is a call to an internal gather load or scatter store function.
3648 Describe the operation in INFO. */
3650 static void
3651 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3653 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3654 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3655 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3657 info->ifn = gimple_call_internal_fn (call);
3658 info->decl = NULL_TREE;
3659 info->base = gimple_call_arg (call, 0);
3660 info->offset = gimple_call_arg (call, 1);
3661 info->offset_dt = vect_unknown_def_type;
3662 info->offset_vectype = NULL_TREE;
3663 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3664 info->element_type = TREE_TYPE (vectype);
3665 info->memory_type = TREE_TYPE (DR_REF (dr));
3668 /* Return true if a non-affine read or write in STMT is suitable for a
3669 gather load or scatter store. Describe the operation in *INFO if so. */
3671 bool
3672 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3673 gather_scatter_info *info)
3675 HOST_WIDE_INT scale = 1;
3676 poly_int64 pbitpos, pbitsize;
3677 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3678 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3679 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3680 tree offtype = NULL_TREE;
3681 tree decl = NULL_TREE, base, off;
3682 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3683 tree memory_type = TREE_TYPE (DR_REF (dr));
3684 machine_mode pmode;
3685 int punsignedp, reversep, pvolatilep = 0;
3686 internal_fn ifn;
3687 tree element_type;
3688 bool masked_p = false;
3690 /* See whether this is already a call to a gather/scatter internal function.
3691 If not, see whether it's a masked load or store. */
3692 gcall *call = dyn_cast <gcall *> (stmt);
3693 if (call && gimple_call_internal_p (call))
3695 ifn = gimple_call_internal_fn (stmt);
3696 if (internal_gather_scatter_fn_p (ifn))
3698 vect_describe_gather_scatter_call (call, info);
3699 return true;
3701 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3704 /* True if we should aim to use internal functions rather than
3705 built-in functions. */
3706 bool use_ifn_p = (DR_IS_READ (dr)
3707 ? supports_vec_gather_load_p ()
3708 : supports_vec_scatter_store_p ());
3710 base = DR_REF (dr);
3711 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3712 see if we can use the def stmt of the address. */
3713 if (masked_p
3714 && TREE_CODE (base) == MEM_REF
3715 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3716 && integer_zerop (TREE_OPERAND (base, 1))
3717 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3719 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3720 if (is_gimple_assign (def_stmt)
3721 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3722 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3725 /* The gather and scatter builtins need an address of the form
3726 loop_invariant + vector * {1, 2, 4, 8}
3727 or
3728 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3729 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3730 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3731 multiplications and additions in it. To get a vector, we need
3732 a single SSA_NAME that will be defined in the loop and will
3733 contain everything that is not loop invariant and that can be
3734 vectorized. The following code attempts to find such a preexisting
3735 SSA_NAME OFF and put the loop invariants into a tree BASE
3736 that can be gimplified before the loop. */
3737 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3738 &punsignedp, &reversep, &pvolatilep);
3739 if (reversep)
3740 return false;
3742 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3744 if (TREE_CODE (base) == MEM_REF)
3746 if (!integer_zerop (TREE_OPERAND (base, 1)))
3748 if (off == NULL_TREE)
3749 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3750 else
3751 off = size_binop (PLUS_EXPR, off,
3752 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3754 base = TREE_OPERAND (base, 0);
3756 else
3757 base = build_fold_addr_expr (base);
3759 if (off == NULL_TREE)
3760 off = size_zero_node;
3762 /* If base is not loop invariant, then if off is 0 we start with just
3763 the constant offset in the loop invariant BASE and continue with base
3764 as OFF; otherwise give up.
3765 We could handle that case by gimplifying the addition of base + off
3766 into some SSA_NAME and use that as off, but for now punt. */
3767 if (!expr_invariant_in_loop_p (loop, base))
3769 if (!integer_zerop (off))
3770 return false;
3771 off = base;
3772 base = size_int (pbytepos);
3774 /* Otherwise put base + constant offset into the loop invariant BASE
3775 and continue with OFF. */
3776 else
3778 base = fold_convert (sizetype, base);
3779 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3782 /* OFF at this point may be either a SSA_NAME or some tree expression
3783 from get_inner_reference. Try to peel off loop invariants from it
3784 into BASE as long as possible. */
3785 STRIP_NOPS (off);
3786 while (offtype == NULL_TREE)
3788 enum tree_code code;
3789 tree op0, op1, add = NULL_TREE;
3791 if (TREE_CODE (off) == SSA_NAME)
3793 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3795 if (expr_invariant_in_loop_p (loop, off))
3796 return false;
3798 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3799 break;
3801 op0 = gimple_assign_rhs1 (def_stmt);
3802 code = gimple_assign_rhs_code (def_stmt);
3803 op1 = gimple_assign_rhs2 (def_stmt);
3805 else
3807 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3808 return false;
3809 code = TREE_CODE (off);
3810 extract_ops_from_tree (off, &code, &op0, &op1);
3812 switch (code)
3814 case POINTER_PLUS_EXPR:
3815 case PLUS_EXPR:
3816 if (expr_invariant_in_loop_p (loop, op0))
3818 add = op0;
3819 off = op1;
3820 do_add:
3821 add = fold_convert (sizetype, add);
3822 if (scale != 1)
3823 add = size_binop (MULT_EXPR, add, size_int (scale));
3824 base = size_binop (PLUS_EXPR, base, add);
3825 continue;
3827 if (expr_invariant_in_loop_p (loop, op1))
3829 add = op1;
3830 off = op0;
3831 goto do_add;
3833 break;
3834 case MINUS_EXPR:
3835 if (expr_invariant_in_loop_p (loop, op1))
3837 add = fold_convert (sizetype, op1);
3838 add = size_binop (MINUS_EXPR, size_zero_node, add);
3839 off = op0;
3840 goto do_add;
3842 break;
3843 case MULT_EXPR:
3844 if (scale == 1 && tree_fits_shwi_p (op1))
3846 int new_scale = tree_to_shwi (op1);
3847 /* Only treat this as a scaling operation if the target
3848 supports it. */
3849 if (use_ifn_p
3850 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3851 vectype, memory_type, 1,
3852 TYPE_SIGN (TREE_TYPE (op0)),
3853 new_scale, &ifn,
3854 &element_type))
3855 break;
3856 scale = new_scale;
3857 off = op0;
3858 continue;
3860 break;
3861 case SSA_NAME:
3862 off = op0;
3863 continue;
3864 CASE_CONVERT:
3865 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3866 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3867 break;
3868 if (TYPE_PRECISION (TREE_TYPE (op0))
3869 == TYPE_PRECISION (TREE_TYPE (off)))
3871 off = op0;
3872 continue;
3875 /* The internal functions need the offset to be the same width
3876 as the elements of VECTYPE. Don't include operations that
3877 cast the offset from that width to a different width. */
3878 if (use_ifn_p
3879 && (int_size_in_bytes (TREE_TYPE (vectype))
3880 == int_size_in_bytes (TREE_TYPE (off))))
3881 break;
3883 if (TYPE_PRECISION (TREE_TYPE (op0))
3884 < TYPE_PRECISION (TREE_TYPE (off)))
3886 off = op0;
3887 offtype = TREE_TYPE (off);
3888 STRIP_NOPS (off);
3889 continue;
3891 break;
3892 default:
3893 break;
3895 break;
3898 /* If at the end OFF still isn't a SSA_NAME or isn't
3899 defined in the loop, punt. */
3900 if (TREE_CODE (off) != SSA_NAME
3901 || expr_invariant_in_loop_p (loop, off))
3902 return false;
3904 if (offtype == NULL_TREE)
3905 offtype = TREE_TYPE (off);
3907 if (use_ifn_p)
3909 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3910 memory_type, TYPE_PRECISION (offtype),
3911 TYPE_SIGN (offtype), scale, &ifn,
3912 &element_type))
3913 return false;
3915 else
3917 if (DR_IS_READ (dr))
3919 if (targetm.vectorize.builtin_gather)
3920 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3922 else
3924 if (targetm.vectorize.builtin_scatter)
3925 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3928 if (!decl)
3929 return false;
3931 ifn = IFN_LAST;
3932 element_type = TREE_TYPE (vectype);
3935 info->ifn = ifn;
3936 info->decl = decl;
3937 info->base = base;
3938 info->offset = off;
3939 info->offset_dt = vect_unknown_def_type;
3940 info->offset_vectype = NULL_TREE;
3941 info->scale = scale;
3942 info->element_type = element_type;
3943 info->memory_type = memory_type;
3944 return true;
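/* A rough illustration of the decomposition above (P and K are hypothetical
   names, and the exact shape depends on how the address was folded): for an
   access

     ... = p[k[i]];

   where P is a loop-invariant int pointer and K[I] is loaded in the loop,
   the reference is peeled into

     base  = p            (plus any constant byte offset)
     off   = the SSA name holding k[i], after stripping a widening cast
     scale = sizeof (int)

   which is exactly the loop_invariant + vector * scale form required by the
   gather/scatter builtins and internal functions.  */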
3947 /* Find the data references in STMT, analyze them with respect to LOOP and
3948 append them to DATAREFS. Return false if datarefs in this stmt cannot
3949 be handled. */
3951 bool
3952 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3953 vec<data_reference_p> *datarefs)
3955 /* We can ignore clobbers for dataref analysis - they are removed during
3956 loop vectorization and BB vectorization checks dependences with a
3957 stmt walk. */
3958 if (gimple_clobber_p (stmt))
3959 return true;
3961 if (gimple_has_volatile_ops (stmt))
3963 if (dump_enabled_p ())
3965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3966 "not vectorized: volatile type ");
3967 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3969 return false;
3972 if (stmt_can_throw_internal (stmt))
3974 if (dump_enabled_p ())
3976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3977 "not vectorized: statement can throw an "
3978 "exception ");
3979 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3981 return false;
3984 auto_vec<data_reference_p, 2> refs;
3985 if (!find_data_references_in_stmt (loop, stmt, &refs))
3986 return false;
3988 if (refs.is_empty ())
3989 return true;
3991 if (refs.length () > 1)
3993 if (dump_enabled_p ())
3995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3996 "not vectorized: more than one data ref "
3997 "in stmt: ");
3998 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4000 return false;
4003 if (gcall *call = dyn_cast <gcall *> (stmt))
4004 if (!gimple_call_internal_p (call)
4005 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4006 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4008 if (dump_enabled_p ())
4010 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4011 "not vectorized: dr in a call ");
4012 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4014 return false;
4017 data_reference_p dr = refs.pop ();
4018 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4019 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4021 if (dump_enabled_p ())
4023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4024 "not vectorized: statement is bitfield "
4025 "access ");
4026 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4028 return false;
4031 if (DR_BASE_ADDRESS (dr)
4032 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4034 if (dump_enabled_p ())
4035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4036 "not vectorized: base addr of dr is a "
4037 "constant\n");
4038 return false;
4041 /* Check whether this may be a SIMD lane access and adjust the
4042 DR to make it easier for us to handle it. */
4043 if (loop
4044 && loop->simduid
4045 && (!DR_BASE_ADDRESS (dr)
4046 || !DR_OFFSET (dr)
4047 || !DR_INIT (dr)
4048 || !DR_STEP (dr)))
4050 struct data_reference *newdr
4051 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4052 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4053 if (DR_BASE_ADDRESS (newdr)
4054 && DR_OFFSET (newdr)
4055 && DR_INIT (newdr)
4056 && DR_STEP (newdr)
4057 && integer_zerop (DR_STEP (newdr)))
4059 tree off = DR_OFFSET (newdr);
4060 STRIP_NOPS (off);
4061 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4062 && TREE_CODE (off) == MULT_EXPR
4063 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4065 tree step = TREE_OPERAND (off, 1);
4066 off = TREE_OPERAND (off, 0);
4067 STRIP_NOPS (off);
4068 if (CONVERT_EXPR_P (off)
4069 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4070 < TYPE_PRECISION (TREE_TYPE (off))))
4071 off = TREE_OPERAND (off, 0);
4072 if (TREE_CODE (off) == SSA_NAME)
4074 gimple *def = SSA_NAME_DEF_STMT (off);
4075 tree reft = TREE_TYPE (DR_REF (newdr));
4076 if (is_gimple_call (def)
4077 && gimple_call_internal_p (def)
4078 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4080 tree arg = gimple_call_arg (def, 0);
4081 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4082 arg = SSA_NAME_VAR (arg);
4083 if (arg == loop->simduid
4084 /* For now. */
4085 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4087 DR_OFFSET (newdr) = ssize_int (0);
4088 DR_STEP (newdr) = step;
4089 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4090 DR_STEP_ALIGNMENT (newdr)
4091 = highest_pow2_factor (step);
4092 /* Mark as simd-lane access. */
4093 newdr->aux = (void *)-1;
4094 free_data_ref (dr);
4095 datarefs->safe_push (newdr);
4096 return true;
4102 free_data_ref (newdr);
4105 datarefs->safe_push (dr);
4106 return true;
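/* Sketch of the SIMD lane case handled above (the temporary names are
   hypothetical): a privatized variable in an OpenMP simd loop is lowered to
   an array indexed by the lane number, e.g.

     D.tmp[.GOMP_SIMD_LANE (simduid)] = ...;

   The data-ref for such an access initially has a non-affine offset of the
   form lane * sizeof (element); the code above rewrites it into a zero
   offset with step sizeof (element) and marks it as a simd-lane access, so
   that consecutive lanes can be treated as one contiguous vector access.  */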
4109 /* Function vect_analyze_data_refs.
4111 Find all the data references in the loop or basic block.
4113 The general structure of the analysis of data refs in the vectorizer is as
4114 follows:
4115 1- vect_analyze_data_refs(loop/bb): call
4116 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4117 in the loop/bb and their dependences.
4118 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4119 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4120 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4124 bool
4125 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4127 struct loop *loop = NULL;
4128 unsigned int i;
4129 struct data_reference *dr;
4130 tree scalar_type;
4132 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4134 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4135 loop = LOOP_VINFO_LOOP (loop_vinfo);
4137 /* Go through the data-refs, check that the analysis succeeded. Update
4138 pointer from stmt_vec_info struct to DR and vectype. */
4140 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4141 FOR_EACH_VEC_ELT (datarefs, i, dr)
4143 gimple *stmt;
4144 stmt_vec_info stmt_info;
4145 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4146 poly_uint64 vf;
4148 gcc_assert (DR_REF (dr));
4149 stmt = vect_dr_stmt (dr);
4150 stmt_info = vinfo_for_stmt (stmt);
4152 /* Check that analysis of the data-ref succeeded. */
4153 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4154 || !DR_STEP (dr))
4156 bool maybe_gather
4157 = DR_IS_READ (dr)
4158 && !TREE_THIS_VOLATILE (DR_REF (dr))
4159 && (targetm.vectorize.builtin_gather != NULL
4160 || supports_vec_gather_load_p ());
4161 bool maybe_scatter
4162 = DR_IS_WRITE (dr)
4163 && !TREE_THIS_VOLATILE (DR_REF (dr))
4164 && (targetm.vectorize.builtin_scatter != NULL
4165 || supports_vec_scatter_store_p ());
4167 /* If target supports vector gather loads or scatter stores,
4168 see if they can be used. */
4169 if (is_a <loop_vec_info> (vinfo)
4170 && !nested_in_vect_loop_p (loop, stmt))
4172 if (maybe_gather || maybe_scatter)
4174 if (maybe_gather)
4175 gatherscatter = GATHER;
4176 else
4177 gatherscatter = SCATTER;
4181 if (gatherscatter == SG_NONE)
4183 if (dump_enabled_p ())
4185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4186 "not vectorized: data ref analysis "
4187 "failed ");
4188 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4190 if (is_a <bb_vec_info> (vinfo))
4192 /* In BB vectorization the ref can still participate
4193 in dependence analysis, we just can't vectorize it. */
4194 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4195 continue;
4197 return false;
4201 /* See if this was detected as SIMD lane access. */
4202 if (dr->aux == (void *)-1)
4204 if (nested_in_vect_loop_p (loop, stmt))
4206 if (dump_enabled_p ())
4208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4209 "not vectorized: data ref analysis "
4210 "failed ");
4211 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4213 return false;
4215 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4218 tree base = get_base_address (DR_REF (dr));
4219 if (base && VAR_P (base) && DECL_NONALIASED (base))
4221 if (dump_enabled_p ())
4223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4224 "not vectorized: base object not addressable "
4225 "for stmt: ");
4226 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4228 if (is_a <bb_vec_info> (vinfo))
4230 /* In BB vectorization the ref can still participate
4231 in dependence analysis, we just can't vectorize it. */
4232 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4233 continue;
4235 return false;
4238 if (is_a <loop_vec_info> (vinfo)
4239 && DR_STEP (dr)
4240 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4242 if (nested_in_vect_loop_p (loop, stmt))
4244 if (dump_enabled_p ())
4246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4247 "not vectorized: not suitable for strided "
4248 "load ");
4249 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4251 return false;
4253 STMT_VINFO_STRIDED_P (stmt_info) = true;
4256 /* Update DR field in stmt_vec_info struct. */
4258 /* If the dataref is in an inner-loop of the loop that is considered for
4259 vectorization, we also want to analyze the access relative to
4260 the outer-loop (DR contains information only relative to the
4261 inner-most enclosing loop). We do that by building a reference to the
4262 first location accessed by the inner-loop, and analyze it relative to
4263 the outer-loop. */
4264 if (loop && nested_in_vect_loop_p (loop, stmt))
4266 /* Build a reference to the first location accessed by the
4267 inner loop: *(BASE + INIT + OFFSET). By construction,
4268 this address must be invariant in the inner loop, so we
4269 can consider it as being used in the outer loop. */
4270 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4271 tree offset = unshare_expr (DR_OFFSET (dr));
4272 tree init = unshare_expr (DR_INIT (dr));
4273 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4274 init, offset);
4275 tree init_addr = fold_build_pointer_plus (base, init_offset);
4276 tree init_ref = build_fold_indirect_ref (init_addr);
4278 if (dump_enabled_p ())
4280 dump_printf_loc (MSG_NOTE, vect_location,
4281 "analyze in outer loop: ");
4282 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4283 dump_printf (MSG_NOTE, "\n");
4286 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4287 init_ref, loop))
4288 /* dr_analyze_innermost already explained the failure. */
4289 return false;
4291 if (dump_enabled_p ())
4293 dump_printf_loc (MSG_NOTE, vect_location,
4294 "\touter base_address: ");
4295 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4296 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4297 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4298 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4299 STMT_VINFO_DR_OFFSET (stmt_info));
4300 dump_printf (MSG_NOTE,
4301 "\n\touter constant offset from base address: ");
4302 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4303 STMT_VINFO_DR_INIT (stmt_info));
4304 dump_printf (MSG_NOTE, "\n\touter step: ");
4305 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4306 STMT_VINFO_DR_STEP (stmt_info));
4307 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4308 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4309 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4310 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4311 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4312 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4313 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4314 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4318 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
4319 STMT_VINFO_DATA_REF (stmt_info) = dr;
4321 /* Set vectype for STMT. */
4322 scalar_type = TREE_TYPE (DR_REF (dr));
4323 STMT_VINFO_VECTYPE (stmt_info)
4324 = get_vectype_for_scalar_type (scalar_type);
4325 if (!STMT_VINFO_VECTYPE (stmt_info))
4327 if (dump_enabled_p ())
4329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4330 "not vectorized: no vectype for stmt: ");
4331 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4332 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4333 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4334 scalar_type);
4335 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4338 if (is_a <bb_vec_info> (vinfo))
4340 /* No vector type is fine, the ref can still participate
4341 in dependence analysis, we just can't vectorize it. */
4342 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4343 continue;
4345 return false;
4347 else
4349 if (dump_enabled_p ())
4351 dump_printf_loc (MSG_NOTE, vect_location,
4352 "got vectype for stmt: ");
4353 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4354 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4355 STMT_VINFO_VECTYPE (stmt_info));
4356 dump_printf (MSG_NOTE, "\n");
4360 /* Adjust the minimal vectorization factor according to the
4361 vector type. */
4362 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4363 *min_vf = upper_bound (*min_vf, vf);
4365 if (gatherscatter != SG_NONE)
4367 gather_scatter_info gs_info;
4368 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4369 &gs_info)
4370 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4372 if (dump_enabled_p ())
4374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4375 (gatherscatter == GATHER) ?
4376 "not vectorized: not suitable for gather "
4377 "load " :
4378 "not vectorized: not suitable for scatter "
4379 "store ");
4380 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4382 return false;
4384 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4388 /* We used to stop processing and prune the list here. Verify we no
4389 longer need to. */
4390 gcc_assert (i == datarefs.length ());
4392 return true;
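/* Rough illustration of the outer-loop analysis performed above (IN is a
   hypothetical array, echoing the example used elsewhere in this file):
   when vectorizing the I loop of

     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
	 s += in[i + j];

   the inner-loop data-ref for IN[I + J] is re-analyzed via the reference
   *(BASE + INIT + OFFSET) built above, giving an outer base address of &IN,
   an outer step of sizeof (*in), and the remaining outer-loop behavior
   recorded in STMT_VINFO_DR_WRT_VEC_LOOP.  */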
4396 /* Function vect_get_new_vect_var.
4398 Create and return a new variable. The current naming scheme prepends
4399 the prefix "vect_", "vectp_", "stmp_" or "mask_" (depending on the value
4400 of VAR_KIND) to NAME if it is provided, and uses the bare prefix
4401 otherwise. */
4403 tree
4404 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4406 const char *prefix;
4407 tree new_vect_var;
4409 switch (var_kind)
4411 case vect_simple_var:
4412 prefix = "vect";
4413 break;
4414 case vect_scalar_var:
4415 prefix = "stmp";
4416 break;
4417 case vect_mask_var:
4418 prefix = "mask";
4419 break;
4420 case vect_pointer_var:
4421 prefix = "vectp";
4422 break;
4423 default:
4424 gcc_unreachable ();
4427 if (name)
4429 char* tmp = concat (prefix, "_", name, NULL);
4430 new_vect_var = create_tmp_reg (type, tmp);
4431 free (tmp);
4433 else
4434 new_vect_var = create_tmp_reg (type, prefix);
4436 return new_vect_var;
4439 /* Like vect_get_new_vect_var but return an SSA name. */
4441 tree
4442 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4444 const char *prefix;
4445 tree new_vect_var;
4447 switch (var_kind)
4449 case vect_simple_var:
4450 prefix = "vect";
4451 break;
4452 case vect_scalar_var:
4453 prefix = "stmp";
4454 break;
4455 case vect_pointer_var:
4456 prefix = "vectp";
4457 break;
4458 default:
4459 gcc_unreachable ();
4462 if (name)
4464 char* tmp = concat (prefix, "_", name, NULL);
4465 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4466 free (tmp);
4468 else
4469 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4471 return new_vect_var;
4474 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4476 static void
4477 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4479 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4480 int misalign = DR_MISALIGNMENT (dr);
4481 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4482 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4483 else
4484 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4485 DR_TARGET_ALIGNMENT (dr), misalign);
4488 /* Function vect_create_addr_base_for_vector_ref.
4490 Create an expression that computes the address of the first memory location
4491 that will be accessed for a data reference.
4493 Input:
4494 STMT: The statement containing the data reference.
4495 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4496 OFFSET: Optional. If supplied, it is added to the initial address.
4497 LOOP: Specify relative to which loop-nest the address should be computed.
4498 For example, when the dataref is in an inner-loop nested in an
4499 outer-loop that is now being vectorized, LOOP can be either the
4500 outer-loop, or the inner-loop. The first memory location accessed
4501 by the following dataref ('in' points to short):
4503 for (i=0; i<N; i++)
4504 for (j=0; j<M; j++)
4505 s += in[i+j]
4507 is as follows:
4508 if LOOP=i_loop: &in (relative to i_loop)
4509 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4510 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4511 initial address. Unlike OFFSET, which is number of elements to
4512 be added, BYTE_OFFSET is measured in bytes.
4514 Output:
4515 1. Return an SSA_NAME whose value is the address of the memory location of
4516 the first vector of the data reference.
4517 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4518 these statement(s) which define the returned SSA_NAME.
4520 FORNOW: We are only handling array accesses with step 1. */
4522 tree
4523 vect_create_addr_base_for_vector_ref (gimple *stmt,
4524 gimple_seq *new_stmt_list,
4525 tree offset,
4526 tree byte_offset)
4528 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4529 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4530 const char *base_name;
4531 tree addr_base;
4532 tree dest;
4533 gimple_seq seq = NULL;
4534 tree vect_ptr_type;
4535 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4536 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4537 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4539 tree data_ref_base = unshare_expr (drb->base_address);
4540 tree base_offset = unshare_expr (drb->offset);
4541 tree init = unshare_expr (drb->init);
4543 if (loop_vinfo)
4544 base_name = get_name (data_ref_base);
4545 else
4547 base_offset = ssize_int (0);
4548 init = ssize_int (0);
4549 base_name = get_name (DR_REF (dr));
4552 /* Create base_offset */
4553 base_offset = size_binop (PLUS_EXPR,
4554 fold_convert (sizetype, base_offset),
4555 fold_convert (sizetype, init));
4557 if (offset)
4559 offset = fold_build2 (MULT_EXPR, sizetype,
4560 fold_convert (sizetype, offset), step);
4561 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4562 base_offset, offset);
4564 if (byte_offset)
4566 byte_offset = fold_convert (sizetype, byte_offset);
4567 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4568 base_offset, byte_offset);
4571 /* base + base_offset */
4572 if (loop_vinfo)
4573 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4574 else
4576 addr_base = build1 (ADDR_EXPR,
4577 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4578 unshare_expr (DR_REF (dr)));
4581 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4582 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4583 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4584 gimple_seq_add_seq (new_stmt_list, seq);
4586 if (DR_PTR_INFO (dr)
4587 && TREE_CODE (addr_base) == SSA_NAME
4588 && !SSA_NAME_PTR_INFO (addr_base))
4590 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4591 if (offset || byte_offset)
4592 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4595 if (dump_enabled_p ())
4597 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4598 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4599 dump_printf (MSG_NOTE, "\n");
4602 return addr_base;
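/* Worked illustration of the OFFSET vs. BYTE_OFFSET distinction above,
   assuming a data reference to 2-byte (short) elements: passing OFFSET == 3
   contributes 3 * 2 == 6 bytes to the initial address, because OFFSET is
   scaled by the element size, whereas passing BYTE_OFFSET == 3 contributes
   exactly 3 bytes.  Roughly:

     addr = base_address + base_offset + init
	    + offset * sizeof (element)
	    + byte_offset  */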
4606 /* Function vect_create_data_ref_ptr.
4608 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4609 location accessed in the loop by STMT, along with the def-use update
4610 chain to appropriately advance the pointer through the loop iterations.
4611 Also set aliasing information for the pointer. This pointer is used by
4612 the callers to this function to create a memory reference expression for
4613 vector load/store access.
4615 Input:
4616 1. STMT: a stmt that references memory. Expected to be of the form
4617 GIMPLE_ASSIGN <name, data-ref> or
4618 GIMPLE_ASSIGN <data-ref, name>.
4619 2. AGGR_TYPE: the type of the reference, which should be either a vector
4620 or an array.
4621 3. AT_LOOP: the loop where the vector memref is to be created.
4622 4. OFFSET (optional): an offset to be added to the initial address accessed
4623 by the data-ref in STMT.
4624 5. BSI: location where the new stmts are to be placed if there is no loop
4625 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4626 pointing to the initial address.
4627 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4628 to the initial address accessed by the data-ref in STMT. This is
4629 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4630 in bytes.
4631 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4632 to the IV during each iteration of the loop. NULL says to move
4633 by one copy of AGGR_TYPE up or down, depending on the step of the
4634 data reference.
4636 Output:
4637 1. Declare a new ptr to vector_type, and have it point to the base of the
4638 data reference (the initial address accessed by the data reference).
4639 For example, for vector of type V8HI, the following code is generated:
4641 v8hi *ap;
4642 ap = (v8hi *)initial_address;
4644 if OFFSET is not supplied:
4645 initial_address = &a[init];
4646 if OFFSET is supplied:
4647 initial_address = &a[init + OFFSET];
4648 if BYTE_OFFSET is supplied:
4649 initial_address = &a[init] + BYTE_OFFSET;
4651 Return the initial_address in INITIAL_ADDRESS.
4653 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4654 update the pointer in each iteration of the loop.
4656 Return the increment stmt that updates the pointer in PTR_INCR.
4658 3. Set INV_P to true if the access pattern of the data reference in the
4659 vectorized loop is invariant. Set it to false otherwise.
4661 4. Return the pointer. */
4663 tree
4664 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4665 tree offset, tree *initial_address,
4666 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4667 bool only_init, bool *inv_p, tree byte_offset,
4668 tree iv_step)
4670 const char *base_name;
4671 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4672 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4673 struct loop *loop = NULL;
4674 bool nested_in_vect_loop = false;
4675 struct loop *containing_loop = NULL;
4676 tree aggr_ptr_type;
4677 tree aggr_ptr;
4678 tree new_temp;
4679 gimple_seq new_stmt_list = NULL;
4680 edge pe = NULL;
4681 basic_block new_bb;
4682 tree aggr_ptr_init;
4683 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4684 tree aptr;
4685 gimple_stmt_iterator incr_gsi;
4686 bool insert_after;
4687 tree indx_before_incr, indx_after_incr;
4688 gimple *incr;
4689 tree step;
4690 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4692 gcc_assert (iv_step != NULL_TREE
4693 || TREE_CODE (aggr_type) == ARRAY_TYPE
4694 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4696 if (loop_vinfo)
4698 loop = LOOP_VINFO_LOOP (loop_vinfo);
4699 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4700 containing_loop = (gimple_bb (stmt))->loop_father;
4701 pe = loop_preheader_edge (loop);
4703 else
4705 gcc_assert (bb_vinfo);
4706 only_init = true;
4707 *ptr_incr = NULL;
4710 /* Check the step (evolution) of the load in LOOP, and record
4711 whether it's invariant. */
4712 step = vect_dr_behavior (dr)->step;
4713 if (integer_zerop (step))
4714 *inv_p = true;
4715 else
4716 *inv_p = false;
4718 /* Create an expression for the first address accessed by this load
4719 in LOOP. */
4720 base_name = get_name (DR_BASE_ADDRESS (dr));
4722 if (dump_enabled_p ())
4724 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4725 dump_printf_loc (MSG_NOTE, vect_location,
4726 "create %s-pointer variable to type: ",
4727 get_tree_code_name (TREE_CODE (aggr_type)));
4728 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4729 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4730 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4731 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4732 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4733 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4734 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4735 else
4736 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4737 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4738 dump_printf (MSG_NOTE, "\n");
4741 /* (1) Create the new aggregate-pointer variable.
4742 Vector and array types inherit the alias set of their component
4743 type by default so we need to use a ref-all pointer if the data
4744 reference does not conflict with the created aggregated data
4745 reference because it is not addressable. */
4746 bool need_ref_all = false;
4747 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4748 get_alias_set (DR_REF (dr))))
4749 need_ref_all = true;
4750 /* Likewise for any of the data references in the stmt group. */
4751 else if (DR_GROUP_SIZE (stmt_info) > 1)
4753 gimple *orig_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info);
4756 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4757 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4758 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4759 get_alias_set (DR_REF (sdr))))
4761 need_ref_all = true;
4762 break;
4764 orig_stmt = DR_GROUP_NEXT_ELEMENT (sinfo);
4766 while (orig_stmt);
4768 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4769 need_ref_all);
4770 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4773 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4774 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4775 def-use update cycles for the pointer: one relative to the outer-loop
4776 (LOOP), which is what steps (3) and (4) below do. The other is relative
4777 to the inner-loop (which is the inner-most loop containing the dataref),
4778 and this is done by step (5) below.
4780 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4781 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4782 redundant. Steps (3),(4) create the following:
4784 vp0 = &base_addr;
4785 LOOP: vp1 = phi(vp0,vp2)
4788 vp2 = vp1 + step
4789 goto LOOP
4791 If there is an inner-loop nested in loop, then step (5) will also be
4792 applied, and an additional update in the inner-loop will be created:
4794 vp0 = &base_addr;
4795 LOOP: vp1 = phi(vp0,vp2)
4797 inner: vp3 = phi(vp1,vp4)
4798 vp4 = vp3 + inner_step
4799 if () goto inner
4801 vp2 = vp1 + step
4802 if () goto LOOP */
4804 /* (2) Calculate the initial address of the aggregate-pointer, and set
4805 the aggregate-pointer to point to it before the loop. */
4807 /* Create: &(base[init_val+offset]+byte_offset) in the loop preheader. */
4809 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4810 offset, byte_offset);
4811 if (new_stmt_list)
4813 if (pe)
4815 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4816 gcc_assert (!new_bb);
4818 else
4819 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4822 *initial_address = new_temp;
4823 aggr_ptr_init = new_temp;
4825 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4826 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4827 inner-loop nested in LOOP (during outer-loop vectorization). */
4829 /* No update in loop is required. */
4830 if (only_init && (!loop_vinfo || at_loop == loop))
4831 aptr = aggr_ptr_init;
4832 else
4834 if (iv_step == NULL_TREE)
4836 /* The step of the aggregate pointer is the type size. */
4837 iv_step = TYPE_SIZE_UNIT (aggr_type);
4838 /* One exception to the above is when the scalar step of the load in
4839 LOOP is zero. In this case the step here is also zero. */
4840 if (*inv_p)
4841 iv_step = size_zero_node;
4842 else if (tree_int_cst_sgn (step) == -1)
4843 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4846 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4848 create_iv (aggr_ptr_init,
4849 fold_convert (aggr_ptr_type, iv_step),
4850 aggr_ptr, loop, &incr_gsi, insert_after,
4851 &indx_before_incr, &indx_after_incr);
4852 incr = gsi_stmt (incr_gsi);
4853 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4855 /* Copy the points-to information if it exists. */
4856 if (DR_PTR_INFO (dr))
4858 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4859 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4861 if (ptr_incr)
4862 *ptr_incr = incr;
4864 aptr = indx_before_incr;
4867 if (!nested_in_vect_loop || only_init)
4868 return aptr;
4871 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4872 nested in LOOP, if it exists. */
4874 gcc_assert (nested_in_vect_loop);
4875 if (!only_init)
4877 standard_iv_increment_position (containing_loop, &incr_gsi,
4878 &insert_after);
4879 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4880 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4881 &indx_after_incr);
4882 incr = gsi_stmt (incr_gsi);
4883 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4885 /* Copy the points-to information if it exists. */
4886 if (DR_PTR_INFO (dr))
4888 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4889 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4891 if (ptr_incr)
4892 *ptr_incr = incr;
4894 return indx_before_incr;
4896 else
4897 gcc_unreachable ();
4901 /* Function bump_vector_ptr
4903 Increment a pointer (to a vector type) by vector-size. If requested,
4904 i.e. if PTR-INCR is given, then also connect the new increment stmt
4905 to the existing def-use update-chain of the pointer, by modifying
4906 the PTR_INCR as illustrated below:
4908 The pointer def-use update-chain before this function:
4909 DATAREF_PTR = phi (p_0, p_2)
4910 ....
4911 PTR_INCR: p_2 = DATAREF_PTR + step
4913 The pointer def-use update-chain after this function:
4914 DATAREF_PTR = phi (p_0, p_2)
4915 ....
4916 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4917 ....
4918 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4920 Input:
4921 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4922 in the loop.
4923 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4924 the loop. The increment amount across iterations is expected
4925 to be vector_size.
4926 BSI - location where the new update stmt is to be placed.
4927 STMT - the original scalar memory-access stmt that is being vectorized.
4928 BUMP - optional. The offset by which to bump the pointer. If not given,
4929 the offset is assumed to be vector_size.
4931 Output: Return NEW_DATAREF_PTR as illustrated above.
4935 tree
4936 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4937 gimple *stmt, tree bump)
4939 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4940 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4941 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4942 tree update = TYPE_SIZE_UNIT (vectype);
4943 gassign *incr_stmt;
4944 ssa_op_iter iter;
4945 use_operand_p use_p;
4946 tree new_dataref_ptr;
4948 if (bump)
4949 update = bump;
4951 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4952 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4953 else
4954 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4955 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4956 dataref_ptr, update);
4957 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4959 /* Copy the points-to information if it exists. */
4960 if (DR_PTR_INFO (dr))
4962 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4963 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4966 if (!ptr_incr)
4967 return new_dataref_ptr;
4969 /* Update the vector-pointer's cross-iteration increment. */
4970 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4972 tree use = USE_FROM_PTR (use_p);
4974 if (use == dataref_ptr)
4975 SET_USE (use_p, new_dataref_ptr);
4976 else
4977 gcc_assert (operand_equal_p (use, update, 0));
4980 return new_dataref_ptr;
4984 /* Copy memory reference info such as base/clique from the SRC reference
4985 to the DEST MEM_REF. */
4987 void
4988 vect_copy_ref_info (tree dest, tree src)
4990 if (TREE_CODE (dest) != MEM_REF)
4991 return;
4993 tree src_base = src;
4994 while (handled_component_p (src_base))
4995 src_base = TREE_OPERAND (src_base, 0);
4996 if (TREE_CODE (src_base) != MEM_REF
4997 && TREE_CODE (src_base) != TARGET_MEM_REF)
4998 return;
5000 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5001 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5005 /* Function vect_create_destination_var.
5007 Create a new temporary of type VECTYPE. */
5009 tree
5010 vect_create_destination_var (tree scalar_dest, tree vectype)
5012 tree vec_dest;
5013 const char *name;
5014 char *new_name;
5015 tree type;
5016 enum vect_var_kind kind;
5018 kind = vectype
5019 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5020 ? vect_mask_var
5021 : vect_simple_var
5022 : vect_scalar_var;
5023 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5025 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5027 name = get_name (scalar_dest);
5028 if (name)
5029 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5030 else
5031 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5032 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5033 free (new_name);
5035 return vec_dest;
5038 /* Function vect_grouped_store_supported.
5040 Returns TRUE if interleave high and interleave low permutations
5041 are supported, and FALSE otherwise. */
5043 bool
5044 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5046 machine_mode mode = TYPE_MODE (vectype);
5048 /* vect_permute_store_chain requires the group size to be equal to 3 or
5049 be a power of two. */
5050 if (count != 3 && exact_log2 (count) == -1)
5052 if (dump_enabled_p ())
5053 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5054 "the size of the group of accesses"
5055 " is not a power of 2 or not equal to 3\n");
5056 return false;
5059 /* Check that the permutation is supported. */
5060 if (VECTOR_MODE_P (mode))
5062 unsigned int i;
5063 if (count == 3)
5065 unsigned int j0 = 0, j1 = 0, j2 = 0;
5066 unsigned int i, j;
5068 unsigned int nelt;
5069 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5071 if (dump_enabled_p ())
5072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5073 "cannot handle groups of 3 stores for"
5074 " variable-length vectors\n");
5075 return false;
5078 vec_perm_builder sel (nelt, nelt, 1);
5079 sel.quick_grow (nelt);
5080 vec_perm_indices indices;
5081 for (j = 0; j < 3; j++)
5083 int nelt0 = ((3 - j) * nelt) % 3;
5084 int nelt1 = ((3 - j) * nelt + 1) % 3;
5085 int nelt2 = ((3 - j) * nelt + 2) % 3;
5086 for (i = 0; i < nelt; i++)
5088 if (3 * i + nelt0 < nelt)
5089 sel[3 * i + nelt0] = j0++;
5090 if (3 * i + nelt1 < nelt)
5091 sel[3 * i + nelt1] = nelt + j1++;
5092 if (3 * i + nelt2 < nelt)
5093 sel[3 * i + nelt2] = 0;
5095 indices.new_vector (sel, 2, nelt);
5096 if (!can_vec_perm_const_p (mode, indices))
5098 if (dump_enabled_p ())
5099 dump_printf (MSG_MISSED_OPTIMIZATION,
5100 "permutation op not supported by target.\n");
5101 return false;
5104 for (i = 0; i < nelt; i++)
5106 if (3 * i + nelt0 < nelt)
5107 sel[3 * i + nelt0] = 3 * i + nelt0;
5108 if (3 * i + nelt1 < nelt)
5109 sel[3 * i + nelt1] = 3 * i + nelt1;
5110 if (3 * i + nelt2 < nelt)
5111 sel[3 * i + nelt2] = nelt + j2++;
5113 indices.new_vector (sel, 2, nelt);
5114 if (!can_vec_perm_const_p (mode, indices))
5116 if (dump_enabled_p ())
5117 dump_printf (MSG_MISSED_OPTIMIZATION,
5118 "permutation op not supported by target.\n");
5119 return false;
5122 return true;
5124 else
5126 /* If COUNT is not equal to 3, only a power of 2 is supported. */
5127 gcc_assert (pow2p_hwi (count));
5128 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5130 /* The encoding has 2 interleaved stepped patterns. */
5131 vec_perm_builder sel (nelt, 2, 3);
5132 sel.quick_grow (6);
5133 for (i = 0; i < 3; i++)
5135 sel[i * 2] = i;
5136 sel[i * 2 + 1] = i + nelt;
5138 vec_perm_indices indices (sel, 2, nelt);
5139 if (can_vec_perm_const_p (mode, indices))
5141 for (i = 0; i < 6; i++)
5142 sel[i] += exact_div (nelt, 2);
5143 indices.new_vector (sel, 2, nelt);
5144 if (can_vec_perm_const_p (mode, indices))
5145 return true;
5150 if (dump_enabled_p ())
5151 dump_printf (MSG_MISSED_OPTIMIZATION,
5152 "permutation op not supported by target.\n");
5153 return false;
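/* For illustration (OUT, A, B and C are hypothetical names): a store group
   such as

     for (i = 0; i < n; i++)
       {
	 out[3 * i + 0] = a[i];
	 out[3 * i + 1] = b[i];
	 out[3 * i + 2] = c[i];
       }

   is accepted when the two three-way VEC_PERM_EXPR selectors built above are
   supported, and any power-of-two group size (2, 4, 8, ...) is accepted when
   the interleave high/low selectors are supported; a group of, say, 5 stores
   is rejected.  */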
5157 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5158 type VECTYPE. MASKED_P says whether the masked form is needed. */
5160 bool
5161 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5162 bool masked_p)
5164 if (masked_p)
5165 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5166 vec_mask_store_lanes_optab,
5167 vectype, count);
5168 else
5169 return vect_lanes_optab_supported_p ("vec_store_lanes",
5170 vec_store_lanes_optab,
5171 vectype, count);
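/* Roughly how callers choose between the two store strategies (a sketch,
   not a verbatim copy of any caller): a store-lanes instruction, when the
   target provides one, writes the whole interleaved group in a single
   operation and is generally preferred over the explicit permutations
   checked by vect_grouped_store_supported.

     if (vect_store_lanes_supported (vectype, group_size, masked_p))
       ... emit IFN_STORE_LANES / IFN_MASK_STORE_LANES ...
     else if (vect_grouped_store_supported (vectype, group_size))
       ... emit the permutations from vect_permute_store_chain ...  */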
5175 /* Function vect_permute_store_chain.
5177 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5178 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5179 the data correctly for the stores. Return the final references for stores
5180 in RESULT_CHAIN.
5182 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5183 The input is 4 vectors each containing 8 elements. We assign a number to
5184 each element, the input sequence is:
5186 1st vec: 0 1 2 3 4 5 6 7
5187 2nd vec: 8 9 10 11 12 13 14 15
5188 3rd vec: 16 17 18 19 20 21 22 23
5189 4th vec: 24 25 26 27 28 29 30 31
5191 The output sequence should be:
5193 1st vec: 0 8 16 24 1 9 17 25
5194 2nd vec: 2 10 18 26 3 11 19 27
5195 3rd vec: 4 12 20 28 5 13 21 29
5196 4th vec: 6 14 22 30 7 15 23 31
5198 i.e., we interleave the contents of the four vectors in their order.
5200 We use interleave_high/low instructions to create such output. The input of
5201 each interleave_high/low operation is two vectors:
5202 1st vec 2nd vec
5203 0 1 2 3 4 5 6 7
5204 the even elements of the result vector are obtained left-to-right from the
5205 high/low elements of the first vector. The odd elements of the result are
5206 obtained left-to-right from the high/low elements of the second vector.
5207 The output of interleave_high will be: 0 4 1 5
5208 and of interleave_low: 2 6 3 7
5211 The permutation is done in log LENGTH stages. In each stage interleave_high
5212 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5213 where the first argument is taken from the first half of DR_CHAIN and the
5214 second argument from its second half.
5215 In our example,
5217 I1: interleave_high (1st vec, 3rd vec)
5218 I2: interleave_low (1st vec, 3rd vec)
5219 I3: interleave_high (2nd vec, 4th vec)
5220 I4: interleave_low (2nd vec, 4th vec)
5222 The output for the first stage is:
5224 I1: 0 16 1 17 2 18 3 19
5225 I2: 4 20 5 21 6 22 7 23
5226 I3: 8 24 9 25 10 26 11 27
5227 I4: 12 28 13 29 14 30 15 31
5229 The output of the second stage, i.e. the final result is:
5231 I1: 0 8 16 24 1 9 17 25
5232 I2: 2 10 18 26 3 11 19 27
5233 I3: 4 12 20 28 5 13 21 29
5234 I4: 6 14 22 30 7 15 23 31. */
5236 void
5237 vect_permute_store_chain (vec<tree> dr_chain,
5238 unsigned int length,
5239 gimple *stmt,
5240 gimple_stmt_iterator *gsi,
5241 vec<tree> *result_chain)
5243 tree vect1, vect2, high, low;
5244 gimple *perm_stmt;
5245 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5246 tree perm_mask_low, perm_mask_high;
5247 tree data_ref;
5248 tree perm3_mask_low, perm3_mask_high;
5249 unsigned int i, j, n, log_length = exact_log2 (length);
5251 result_chain->quick_grow (length);
5252 memcpy (result_chain->address (), dr_chain.address (),
5253 length * sizeof (tree));
5255 if (length == 3)
5257 /* vect_grouped_store_supported ensures that this is constant. */
5258 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5259 unsigned int j0 = 0, j1 = 0, j2 = 0;
5261 vec_perm_builder sel (nelt, nelt, 1);
5262 sel.quick_grow (nelt);
5263 vec_perm_indices indices;
5264 for (j = 0; j < 3; j++)
5266 int nelt0 = ((3 - j) * nelt) % 3;
5267 int nelt1 = ((3 - j) * nelt + 1) % 3;
5268 int nelt2 = ((3 - j) * nelt + 2) % 3;
5270 for (i = 0; i < nelt; i++)
5272 if (3 * i + nelt0 < nelt)
5273 sel[3 * i + nelt0] = j0++;
5274 if (3 * i + nelt1 < nelt)
5275 sel[3 * i + nelt1] = nelt + j1++;
5276 if (3 * i + nelt2 < nelt)
5277 sel[3 * i + nelt2] = 0;
5279 indices.new_vector (sel, 2, nelt);
5280 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5282 for (i = 0; i < nelt; i++)
5284 if (3 * i + nelt0 < nelt)
5285 sel[3 * i + nelt0] = 3 * i + nelt0;
5286 if (3 * i + nelt1 < nelt)
5287 sel[3 * i + nelt1] = 3 * i + nelt1;
5288 if (3 * i + nelt2 < nelt)
5289 sel[3 * i + nelt2] = nelt + j2++;
5291 indices.new_vector (sel, 2, nelt);
5292 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5294 vect1 = dr_chain[0];
5295 vect2 = dr_chain[1];
5297 /* Create interleaving stmt:
5298 low = VEC_PERM_EXPR <vect1, vect2,
5299 {j, nelt, *, j + 1, nelt + j + 1, *,
5300 j + 2, nelt + j + 2, *, ...}> */
5301 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5302 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5303 vect2, perm3_mask_low);
5304 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5306 vect1 = data_ref;
5307 vect2 = dr_chain[2];
5308 /* Create interleaving stmt:
5309 low = VEC_PERM_EXPR <vect1, vect2,
5310 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5311 6, 7, nelt + j + 2, ...}> */
5312 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5313 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5314 vect2, perm3_mask_high);
5315 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5316 (*result_chain)[j] = data_ref;
5319 else
5321 /* If length is not equal to 3 then only a power of 2 is supported. */
5322 gcc_assert (pow2p_hwi (length));
5324 /* The encoding has 2 interleaved stepped patterns. */
5325 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5326 vec_perm_builder sel (nelt, 2, 3);
5327 sel.quick_grow (6);
5328 for (i = 0; i < 3; i++)
5330 sel[i * 2] = i;
5331 sel[i * 2 + 1] = i + nelt;
5333 vec_perm_indices indices (sel, 2, nelt);
5334 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5336 for (i = 0; i < 6; i++)
5337 sel[i] += exact_div (nelt, 2);
5338 indices.new_vector (sel, 2, nelt);
5339 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5341 for (i = 0, n = log_length; i < n; i++)
5343 for (j = 0; j < length/2; j++)
5345 vect1 = dr_chain[j];
5346 vect2 = dr_chain[j+length/2];
5348 /* Create interleaving stmt:
5349 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5350 ...}> */
5351 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5352 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5353 vect2, perm_mask_high);
5354 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5355 (*result_chain)[2*j] = high;
5357 /* Create interleaving stmt:
5358 low = VEC_PERM_EXPR <vect1, vect2,
5359 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5360 ...}> */
5361 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5362 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5363 vect2, perm_mask_low);
5364 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5365 (*result_chain)[2*j+1] = low;
5367 memcpy (dr_chain.address (), result_chain->address (),
5368 length * sizeof (tree));
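/* Illustrative sketch only, not used by the vectorizer: a scalar model of
   the power-of-two interleaving that vect_permute_store_chain performs,
   operating on plain int arrays instead of vector SSA names.  CHAIN points
   to LENGTH arrays of NELT elements each (NELT assumed even) and is
   overwritten with the permuted order; the inner loop mirrors the
   interleave_high/low VEC_PERM_EXPRs built above.  The function name and
   the int-array representation are assumptions made for illustration.  */

static inline void
example_store_permute (int **chain, unsigned int length, unsigned int nelt)
{
  unsigned int log_length = exact_log2 (length);
  int **next = XNEWVEC (int *, length);
  for (unsigned int i = 0; i < length; i++)
    next[i] = XNEWVEC (int, nelt);

  for (unsigned int stage = 0; stage < log_length; stage++)
    {
      for (unsigned int j = 0; j < length / 2; j++)
        {
          int *v1 = chain[j], *v2 = chain[j + length / 2];
          for (unsigned int i = 0; i < nelt / 2; i++)
            {
              /* interleave_high: selector {0, nelt, 1, nelt + 1, ...}.  */
              next[2 * j][2 * i] = v1[i];
              next[2 * j][2 * i + 1] = v2[i];
              /* interleave_low: selector {nelt/2, nelt*3/2, ...}.  */
              next[2 * j + 1][2 * i] = v1[nelt / 2 + i];
              next[2 * j + 1][2 * i + 1] = v2[nelt / 2 + i];
            }
        }
      for (unsigned int i = 0; i < length; i++)
        memcpy (chain[i], next[i], nelt * sizeof (int));
    }

  for (unsigned int i = 0; i < length; i++)
    XDELETEVEC (next[i]);
  XDELETEVEC (next);
}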
5373 /* Function vect_setup_realignment
5375 This function is called when vectorizing an unaligned load using
5376 the dr_explicit_realign[_optimized] scheme.
5377 This function generates the following code at the loop prolog:
5379 p = initial_addr;
5380 x msq_init = *(floor(p)); # prolog load
5381 realignment_token = call target_builtin;
5382 loop:
5383 x msq = phi (msq_init, ---)
5385 The stmts marked with x are generated only for the case of
5386 dr_explicit_realign_optimized.
5388 The code above sets up a new (vector) pointer, pointing to the first
5389 location accessed by STMT, and a "floor-aligned" load using that pointer.
5390 It also generates code to compute the "realignment-token" (if the relevant
5391 target hook was defined), and creates a phi-node at the loop-header bb
5392 whose arguments are the result of the prolog-load (created by this
5393 function) and the result of a load that takes place in the loop (to be
5394 created by the caller to this function).
5396 For the case of dr_explicit_realign_optimized:
5397 The caller to this function uses the phi-result (msq) to create the
5398 realignment code inside the loop, and sets up the missing phi argument,
5399 as follows:
5400 loop:
5401 msq = phi (msq_init, lsq)
5402 lsq = *(floor(p')); # load in loop
5403 result = realign_load (msq, lsq, realignment_token);
5405 For the case of dr_explicit_realign:
5406 loop:
5407 msq = *(floor(p)); # load in loop
5408 p' = p + (VS-1);
5409 lsq = *(floor(p')); # load in loop
5410 result = realign_load (msq, lsq, realignment_token);
5412 Input:
5413 STMT - (scalar) load stmt to be vectorized. This load accesses
5414 a memory location that may be unaligned.
5415 GSI - place where new code is to be inserted.
5416 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5417 is used.
5419 Output:
5420 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5421 target hook, if defined.
5422 Return value - the result of the loop-header phi node. */
5424 tree
5425 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5426 tree *realignment_token,
5427 enum dr_alignment_support alignment_support_scheme,
5428 tree init_addr,
5429 struct loop **at_loop)
5431 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5432 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5433 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5434 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5435 struct loop *loop = NULL;
5436 edge pe = NULL;
5437 tree scalar_dest = gimple_assign_lhs (stmt);
5438 tree vec_dest;
5439 gimple *inc;
5440 tree ptr;
5441 tree data_ref;
5442 basic_block new_bb;
5443 tree msq_init = NULL_TREE;
5444 tree new_temp;
5445 gphi *phi_stmt;
5446 tree msq = NULL_TREE;
5447 gimple_seq stmts = NULL;
5448 bool inv_p;
5449 bool compute_in_loop = false;
5450 bool nested_in_vect_loop = false;
5451 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5452 struct loop *loop_for_initial_load = NULL;
5454 if (loop_vinfo)
5456 loop = LOOP_VINFO_LOOP (loop_vinfo);
5457 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5460 gcc_assert (alignment_support_scheme == dr_explicit_realign
5461 || alignment_support_scheme == dr_explicit_realign_optimized);
5463 /* We need to generate three things:
5464 1. the misalignment computation
5465 2. the extra vector load (for the optimized realignment scheme).
5466 3. the phi node for the two vectors from which the realignment is
5467 done (for the optimized realignment scheme). */
5469 /* 1. Determine where to generate the misalignment computation.
5471 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5472 calculation will be generated by this function, outside the loop (in the
5473 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5474 caller, inside the loop.
5476 Background: If the misalignment remains fixed throughout the iterations of
5477 the loop, then both realignment schemes are applicable, and also the
5478 misalignment computation can be done outside LOOP. This is because we are
5479 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5480 are a multiple of VS (the Vector Size), and therefore the misalignment in
5481 different vectorized LOOP iterations is always the same.
5482 The problem arises only if the memory access is in an inner-loop nested
5483 inside LOOP, which is now being vectorized using outer-loop vectorization.
5484 This is the only case when the misalignment of the memory access may not
5485 remain fixed throughout the iterations of the inner-loop (as explained in
5486 detail in vect_supportable_dr_alignment). In this case, not only is the
5487 optimized realignment scheme not applicable, but also the misalignment
5488 computation (and generation of the realignment token that is passed to
5489 REALIGN_LOAD) have to be done inside the loop.
5491 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5492 or not, which in turn determines if the misalignment is computed inside
5493 the inner-loop, or outside LOOP. */
5495 if (init_addr != NULL_TREE || !loop_vinfo)
5497 compute_in_loop = true;
5498 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5502 /* 2. Determine where to generate the extra vector load.
5504 For the optimized realignment scheme, instead of generating two vector
5505 loads in each iteration, we generate a single extra vector load in the
5506 preheader of the loop, and in each iteration reuse the result of the
5507 vector load from the previous iteration. In case the memory access is in
5508 an inner-loop nested inside LOOP, which is now being vectorized using
5509 outer-loop vectorization, we need to determine whether this initial vector
5510 load should be generated at the preheader of the inner-loop, or can be
5511 generated at the preheader of LOOP. If the memory access has no evolution
5512 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5513 to be generated inside LOOP (in the preheader of the inner-loop). */
5515 if (nested_in_vect_loop)
5517 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5518 bool invariant_in_outerloop =
5519 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5520 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5522 else
5523 loop_for_initial_load = loop;
5524 if (at_loop)
5525 *at_loop = loop_for_initial_load;
5527 if (loop_for_initial_load)
5528 pe = loop_preheader_edge (loop_for_initial_load);
5530 /* 3. For the case of the optimized realignment, create the first vector
5531 load at the loop preheader. */
5533 if (alignment_support_scheme == dr_explicit_realign_optimized)
5535 /* Create msq_init = *(floor(p1)) in the loop preheader */
5536 gassign *new_stmt;
5538 gcc_assert (!compute_in_loop);
5539 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5540 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5541 NULL_TREE, &init_addr, NULL, &inc,
5542 true, &inv_p);
5543 if (TREE_CODE (ptr) == SSA_NAME)
5544 new_temp = copy_ssa_name (ptr);
5545 else
5546 new_temp = make_ssa_name (TREE_TYPE (ptr));
5547 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5548 new_stmt = gimple_build_assign
5549 (new_temp, BIT_AND_EXPR, ptr,
5550 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5551 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5552 gcc_assert (!new_bb);
5553 data_ref
5554 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5555 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5556 vect_copy_ref_info (data_ref, DR_REF (dr));
5557 new_stmt = gimple_build_assign (vec_dest, data_ref);
5558 new_temp = make_ssa_name (vec_dest, new_stmt);
5559 gimple_assign_set_lhs (new_stmt, new_temp);
5560 if (pe)
5562 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5563 gcc_assert (!new_bb);
5565 else
5566 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5568 msq_init = gimple_assign_lhs (new_stmt);
5571 /* 4. Create realignment token using a target builtin, if available.
5572 It is done either inside the containing loop, or before LOOP (as
5573 determined above). */
5575 if (targetm.vectorize.builtin_mask_for_load)
5577 gcall *new_stmt;
5578 tree builtin_decl;
5580 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5581 if (!init_addr)
5583 /* Generate the INIT_ADDR computation outside LOOP. */
5584 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5585 NULL_TREE);
5586 if (loop)
5588 pe = loop_preheader_edge (loop);
5589 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5590 gcc_assert (!new_bb);
5592 else
5593 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5596 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5597 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5598 vec_dest =
5599 vect_create_destination_var (scalar_dest,
5600 gimple_call_return_type (new_stmt));
5601 new_temp = make_ssa_name (vec_dest, new_stmt);
5602 gimple_call_set_lhs (new_stmt, new_temp);
5604 if (compute_in_loop)
5605 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5606 else
5608 /* Generate the misalignment computation outside LOOP. */
5609 pe = loop_preheader_edge (loop);
5610 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5611 gcc_assert (!new_bb);
5614 *realignment_token = gimple_call_lhs (new_stmt);
5616 /* The result of the CALL_EXPR to this builtin is determined from
5617 the value of the parameter, and no global variables are touched,
5618 which makes the builtin a "const" function. Requiring the
5619 builtin to have the "const" attribute makes it unnecessary
5620 to call mark_call_clobbered. */
5621 gcc_assert (TREE_READONLY (builtin_decl));
5624 if (alignment_support_scheme == dr_explicit_realign)
5625 return msq;
5627 gcc_assert (!compute_in_loop);
5628 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5631 /* 5. Create msq = phi <msq_init, lsq> in loop */
5633 pe = loop_preheader_edge (containing_loop);
5634 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5635 msq = make_ssa_name (vec_dest);
5636 phi_stmt = create_phi_node (msq, containing_loop->header);
5637 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5639 return msq;
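/* Illustrative sketch only, not part of the vectorizer: a scalar model of
   what REALIGN_LOAD <msq, lsq, token> computes when the realignment token
   behaves like the misalignment in elements - it selects the NELT
   consecutive elements starting RT elements into the concatenation of MSQ
   and LSQ.  Real targets encode the token as a permutation mask obtained
   from builtin_mask_for_load; the element-offset form and the names here
   are assumptions made for illustration.  */

static inline void
example_realign_load (const int *msq, const int *lsq, unsigned int rt,
                      unsigned int nelt, int *result)
{
  for (unsigned int i = 0; i < nelt; i++)
    result[i] = rt + i < nelt ? msq[rt + i] : lsq[rt + i - nelt];
}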
5643 /* Function vect_grouped_load_supported.
5645 COUNT is the size of the load group (the number of statements plus the
5646 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5647 only one statement, with a gap of COUNT - 1.
5649 Returns true if a suitable permute exists. */
5651 bool
5652 vect_grouped_load_supported (tree vectype, bool single_element_p,
5653 unsigned HOST_WIDE_INT count)
5655 machine_mode mode = TYPE_MODE (vectype);
5657 /* If this is single-element interleaving with an element distance
5658 that leaves unused vector loads around, punt - we would at least create
5659 very sub-optimal code in that case (and blow up memory;
5660 see PR65518). */
5661 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5663 if (dump_enabled_p ())
5664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5665 "single-element interleaving not supported "
5666 "for non-adjacent vector loads\n");
5667 return false;
5670 /* vect_permute_load_chain requires the group size to be equal to 3 or
5671 be a power of two. */
5672 if (count != 3 && exact_log2 (count) == -1)
5674 if (dump_enabled_p ())
5675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5676 "the size of the group of accesses"
5677 " is not a power of 2 or not equal to 3\n");
5678 return false;
5681 /* Check that the permutation is supported. */
5682 if (VECTOR_MODE_P (mode))
5684 unsigned int i, j;
5685 if (count == 3)
5687 unsigned int nelt;
5688 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5690 if (dump_enabled_p ())
5691 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5692 "cannot handle groups of 3 loads for"
5693 " variable-length vectors\n");
5694 return false;
5697 vec_perm_builder sel (nelt, nelt, 1);
5698 sel.quick_grow (nelt);
5699 vec_perm_indices indices;
5700 unsigned int k;
5701 for (k = 0; k < 3; k++)
5703 for (i = 0; i < nelt; i++)
5704 if (3 * i + k < 2 * nelt)
5705 sel[i] = 3 * i + k;
5706 else
5707 sel[i] = 0;
5708 indices.new_vector (sel, 2, nelt);
5709 if (!can_vec_perm_const_p (mode, indices))
5711 if (dump_enabled_p ())
5712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5713 "shuffle of 3 loads is not supported by"
5714 " target\n");
5715 return false;
5717 for (i = 0, j = 0; i < nelt; i++)
5718 if (3 * i + k < 2 * nelt)
5719 sel[i] = i;
5720 else
5721 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5722 indices.new_vector (sel, 2, nelt);
5723 if (!can_vec_perm_const_p (mode, indices))
5725 if (dump_enabled_p ())
5726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5727 "shuffle of 3 loads is not supported by"
5728 " target\n");
5729 return false;
5732 return true;
5734 else
5736 /* If length is not equal to 3 then only a power of 2 is supported. */
5737 gcc_assert (pow2p_hwi (count));
5738 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5740 /* The encoding has a single stepped pattern. */
5741 vec_perm_builder sel (nelt, 1, 3);
5742 sel.quick_grow (3);
5743 for (i = 0; i < 3; i++)
5744 sel[i] = i * 2;
5745 vec_perm_indices indices (sel, 2, nelt);
5746 if (can_vec_perm_const_p (mode, indices))
5748 for (i = 0; i < 3; i++)
5749 sel[i] = i * 2 + 1;
5750 indices.new_vector (sel, 2, nelt);
5751 if (can_vec_perm_const_p (mode, indices))
5752 return true;
5757 if (dump_enabled_p ())
5758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5759 "extract even/odd not supported by target\n");
5760 return false;
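/* Illustrative sketch only: the index vector behind the first "shuffle of
   3 loads" test above, for a constant NELT and a given K (0 <= K < 3).
   It is {K, 3 + K, 6 + K, ...}, padded with don't-care zeros once the
   index would step past the two concatenated input vectors, exactly as in
   the loop above.  The helper name is an assumption for illustration.  */

static inline void
example_load3_indices (unsigned int nelt, unsigned int k, unsigned int *sel)
{
  for (unsigned int i = 0; i < nelt; i++)
    sel[i] = 3 * i + k < 2 * nelt ? 3 * i + k : 0;
}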
5763 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5764 type VECTYPE. MASKED_P says whether the masked form is needed. */
5766 bool
5767 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5768 bool masked_p)
5770 if (masked_p)
5771 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5772 vec_mask_load_lanes_optab,
5773 vectype, count);
5774 else
5775 return vect_lanes_optab_supported_p ("vec_load_lanes",
5776 vec_load_lanes_optab,
5777 vectype, count);
5780 /* Function vect_permute_load_chain.
5782 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5783 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5784 the input data correctly. Return the final references for loads in
5785 RESULT_CHAIN.
5787 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5788 The input is 4 vectors each containing 8 elements. We assign a number to each
5789 element; the input sequence is:
5791 1st vec: 0 1 2 3 4 5 6 7
5792 2nd vec: 8 9 10 11 12 13 14 15
5793 3rd vec: 16 17 18 19 20 21 22 23
5794 4th vec: 24 25 26 27 28 29 30 31
5796 The output sequence should be:
5798 1st vec: 0 4 8 12 16 20 24 28
5799 2nd vec: 1 5 9 13 17 21 25 29
5800 3rd vec: 2 6 10 14 18 22 26 30
5801 4th vec: 3 7 11 15 19 23 27 31
5803 i.e., the first output vector should contain the first elements of each
5804 interleaving group, etc.
5806 We use extract_even/odd instructions to create such output. The input of
5807 each extract_even/odd operation is two vectors
5808 1st vec 2nd vec
5809 0 1 2 3 4 5 6 7
5811 and the output is the vector of extracted even/odd elements. The output of
5812 extract_even will be: 0 2 4 6
5813 and of extract_odd: 1 3 5 7
5816 The permutation is done in log LENGTH stages. In each stage extract_even
5817 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5818 their order. In our example,
5820 E1: extract_even (1st vec, 2nd vec)
5821 E2: extract_odd (1st vec, 2nd vec)
5822 E3: extract_even (3rd vec, 4th vec)
5823 E4: extract_odd (3rd vec, 4th vec)
5825 The output for the first stage will be:
5827 E1: 0 2 4 6 8 10 12 14
5828 E2: 1 3 5 7 9 11 13 15
5829 E3: 16 18 20 22 24 26 28 30
5830 E4: 17 19 21 23 25 27 29 31
5832 In order to proceed and create the correct sequence for the next stage (or
5833 for the correct output, if the second stage is the last one, as in our
5834 example), we first put the output of the extract_even operation and then the
5835 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5836 The input for the second stage is:
5838 1st vec (E1): 0 2 4 6 8 10 12 14
5839 2nd vec (E3): 16 18 20 22 24 26 28 30
5840 3rd vec (E2): 1 3 5 7 9 11 13 15
5841 4th vec (E4): 17 19 21 23 25 27 29 31
5843 The output of the second stage:
5845 E1: 0 4 8 12 16 20 24 28
5846 E2: 2 6 10 14 18 22 26 30
5847 E3: 1 5 9 13 17 21 25 29
5848 E4: 3 7 11 15 19 23 27 31
5850 And RESULT_CHAIN after reordering:
5852 1st vec (E1): 0 4 8 12 16 20 24 28
5853 2nd vec (E3): 1 5 9 13 17 21 25 29
5854 3rd vec (E2): 2 6 10 14 18 22 26 30
5855 4th vec (E4): 3 7 11 15 19 23 27 31. */
5857 static void
5858 vect_permute_load_chain (vec<tree> dr_chain,
5859 unsigned int length,
5860 gimple *stmt,
5861 gimple_stmt_iterator *gsi,
5862 vec<tree> *result_chain)
5864 tree data_ref, first_vect, second_vect;
5865 tree perm_mask_even, perm_mask_odd;
5866 tree perm3_mask_low, perm3_mask_high;
5867 gimple *perm_stmt;
5868 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5869 unsigned int i, j, log_length = exact_log2 (length);
5871 result_chain->quick_grow (length);
5872 memcpy (result_chain->address (), dr_chain.address (),
5873 length * sizeof (tree));
5875 if (length == 3)
5877 /* vect_grouped_load_supported ensures that this is constant. */
5878 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5879 unsigned int k;
5881 vec_perm_builder sel (nelt, nelt, 1);
5882 sel.quick_grow (nelt);
5883 vec_perm_indices indices;
5884 for (k = 0; k < 3; k++)
5886 for (i = 0; i < nelt; i++)
5887 if (3 * i + k < 2 * nelt)
5888 sel[i] = 3 * i + k;
5889 else
5890 sel[i] = 0;
5891 indices.new_vector (sel, 2, nelt);
5892 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5894 for (i = 0, j = 0; i < nelt; i++)
5895 if (3 * i + k < 2 * nelt)
5896 sel[i] = i;
5897 else
5898 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5899 indices.new_vector (sel, 2, nelt);
5900 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5902 first_vect = dr_chain[0];
5903 second_vect = dr_chain[1];
5905 /* Create interleaving stmt (low part of):
5906 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5907 ...}> */
5908 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5909 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5910 second_vect, perm3_mask_low);
5911 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5913 /* Create interleaving stmt (high part of):
5914 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5915 ...}> */
5916 first_vect = data_ref;
5917 second_vect = dr_chain[2];
5918 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5919 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5920 second_vect, perm3_mask_high);
5921 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5922 (*result_chain)[k] = data_ref;
5925 else
5927 /* If length is not equal to 3 then only a power of 2 is supported. */
5928 gcc_assert (pow2p_hwi (length));
5930 /* The encoding has a single stepped pattern. */
5931 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5932 vec_perm_builder sel (nelt, 1, 3);
5933 sel.quick_grow (3);
5934 for (i = 0; i < 3; ++i)
5935 sel[i] = i * 2;
5936 vec_perm_indices indices (sel, 2, nelt);
5937 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5939 for (i = 0; i < 3; ++i)
5940 sel[i] = i * 2 + 1;
5941 indices.new_vector (sel, 2, nelt);
5942 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5944 for (i = 0; i < log_length; i++)
5946 for (j = 0; j < length; j += 2)
5948 first_vect = dr_chain[j];
5949 second_vect = dr_chain[j+1];
5951 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5952 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5953 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5954 first_vect, second_vect,
5955 perm_mask_even);
5956 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5957 (*result_chain)[j/2] = data_ref;
5959 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5960 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5961 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5962 first_vect, second_vect,
5963 perm_mask_odd);
5964 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5965 (*result_chain)[j/2+length/2] = data_ref;
5967 memcpy (dr_chain.address (), result_chain->address (),
5968 length * sizeof (tree));
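/* Illustrative sketch only, not used by the vectorizer: a scalar model of
   the power-of-two extract-even/odd permutation that
   vect_permute_load_chain performs, on plain int arrays instead of vector
   SSA names.  CHAIN points to LENGTH arrays of NELT elements (NELT assumed
   even) and is overwritten with the reordered data; the stores mirror the
   placement into RESULT_CHAIN above.  The function name and the array
   representation are assumptions made for illustration.  */

static inline void
example_load_permute (int **chain, unsigned int length, unsigned int nelt)
{
  unsigned int log_length = exact_log2 (length);
  int **next = XNEWVEC (int *, length);
  for (unsigned int i = 0; i < length; i++)
    next[i] = XNEWVEC (int, nelt);

  for (unsigned int stage = 0; stage < log_length; stage++)
    {
      for (unsigned int j = 0; j < length; j += 2)
        {
          int *v1 = chain[j], *v2 = chain[j + 1];
          for (unsigned int i = 0; i < nelt / 2; i++)
            {
              /* extract_even: selector {0, 2, 4, ...} of v1 ++ v2.  */
              next[j / 2][i] = v1[2 * i];
              next[j / 2][nelt / 2 + i] = v2[2 * i];
              /* extract_odd: selector {1, 3, 5, ...} of v1 ++ v2.  */
              next[j / 2 + length / 2][i] = v1[2 * i + 1];
              next[j / 2 + length / 2][nelt / 2 + i] = v2[2 * i + 1];
            }
        }
      for (unsigned int i = 0; i < length; i++)
        memcpy (chain[i], next[i], nelt * sizeof (int));
    }

  for (unsigned int i = 0; i < length; i++)
    XDELETEVEC (next[i]);
  XDELETEVEC (next);
}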
5973 /* Function vect_shift_permute_load_chain.
5975 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5976 a sequence of stmts to reorder the input data accordingly.
5977 Return the final references for loads in RESULT_CHAIN.
5978 Return true if successful, false otherwise.
5980 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5981 The input is 3 vectors each containing 8 elements. We assign a
5982 number to each element; the input sequence is:
5984 1st vec: 0 1 2 3 4 5 6 7
5985 2nd vec: 8 9 10 11 12 13 14 15
5986 3rd vec: 16 17 18 19 20 21 22 23
5988 The output sequence should be:
5990 1st vec: 0 3 6 9 12 15 18 21
5991 2nd vec: 1 4 7 10 13 16 19 22
5992 3rd vec: 2 5 8 11 14 17 20 23
5994 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5996 First we shuffle all 3 vectors to get correct elements order:
5998 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5999 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6000 3rd vec: (16 19 22) (17 20 23) (18 21)
6002 Next we unite and shift vector 3 times:
6004 1st step:
6005 shift right by 6 the concatenation of:
6006 "1st vec" and "2nd vec"
6007 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6008 "2nd vec" and "3rd vec"
6009 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6010 "3rd vec" and "1st vec"
6011 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6012 | New vectors |
6014 So that now new vectors are:
6016 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6017 2nd vec: (10 13) (16 19 22) (17 20 23)
6018 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6020 2nd step:
6021 shift right by 5 the concatenation of:
6022 "1st vec" and "3rd vec"
6023 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6024 "2nd vec" and "1st vec"
6025 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6026 "3rd vec" and "2nd vec"
6027 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6028 | New vectors |
6030 So that now new vectors are:
6032 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6033 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6034 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6036 3rd step:
6037 shift right by 5 the concatenation of:
6038 "1st vec" and "1st vec"
6039 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6040 shift right by 3 the concatenation of:
6041 "2nd vec" and "2nd vec"
6042 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6043 | New vectors |
6045 So that now all vectors are READY:
6046 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6047 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6048 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6050 This algorithm is faster than the one in vect_permute_load_chain if:
6051 1. "shift of a concatenation" is faster than general permutation.
6052 This is usually so.
6053 2. The TARGET machine can't execute vector instructions in parallel.
6054 This is because each step of the algorithm depends on the previous one.
6055 The algorithm in vect_permute_load_chain is much more parallel.
6057 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6060 static bool
6061 vect_shift_permute_load_chain (vec<tree> dr_chain,
6062 unsigned int length,
6063 gimple *stmt,
6064 gimple_stmt_iterator *gsi,
6065 vec<tree> *result_chain)
6067 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6068 tree perm2_mask1, perm2_mask2, perm3_mask;
6069 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6070 gimple *perm_stmt;
6072 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6073 unsigned int i;
6074 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6075 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6077 unsigned HOST_WIDE_INT nelt, vf;
6078 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6079 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6080 /* Not supported for variable-length vectors. */
6081 return false;
6083 vec_perm_builder sel (nelt, nelt, 1);
6084 sel.quick_grow (nelt);
6086 result_chain->quick_grow (length);
6087 memcpy (result_chain->address (), dr_chain.address (),
6088 length * sizeof (tree));
6090 if (pow2p_hwi (length) && vf > 4)
6092 unsigned int j, log_length = exact_log2 (length);
6093 for (i = 0; i < nelt / 2; ++i)
6094 sel[i] = i * 2;
6095 for (i = 0; i < nelt / 2; ++i)
6096 sel[nelt / 2 + i] = i * 2 + 1;
6097 vec_perm_indices indices (sel, 2, nelt);
6098 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6100 if (dump_enabled_p ())
6101 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6102 "shuffle of 2 fields structure is not \
6103 supported by target\n");
6104 return false;
6106 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6108 for (i = 0; i < nelt / 2; ++i)
6109 sel[i] = i * 2 + 1;
6110 for (i = 0; i < nelt / 2; ++i)
6111 sel[nelt / 2 + i] = i * 2;
6112 indices.new_vector (sel, 2, nelt);
6113 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6115 if (dump_enabled_p ())
6116 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6117 "shuffle of 2 fields structure is not \
6118 supported by target\n");
6119 return false;
6121 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6123 /* Generating permutation constant to shift all elements.
6124 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6125 for (i = 0; i < nelt; i++)
6126 sel[i] = nelt / 2 + i;
6127 indices.new_vector (sel, 2, nelt);
6128 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6130 if (dump_enabled_p ())
6131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6132 "shift permutation is not supported by target\n");
6133 return false;
6135 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6137 /* Generating permutation constant to select vector from 2.
6138 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6139 for (i = 0; i < nelt / 2; i++)
6140 sel[i] = i;
6141 for (i = nelt / 2; i < nelt; i++)
6142 sel[i] = nelt + i;
6143 indices.new_vector (sel, 2, nelt);
6144 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6146 if (dump_enabled_p ())
6147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6148 "select is not supported by target\n");
6149 return false;
6151 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6153 for (i = 0; i < log_length; i++)
6155 for (j = 0; j < length; j += 2)
6157 first_vect = dr_chain[j];
6158 second_vect = dr_chain[j + 1];
6160 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6161 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6162 first_vect, first_vect,
6163 perm2_mask1);
6164 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6165 vect[0] = data_ref;
6167 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6168 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6169 second_vect, second_vect,
6170 perm2_mask2);
6171 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6172 vect[1] = data_ref;
6174 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6175 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6176 vect[0], vect[1], shift1_mask);
6177 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6178 (*result_chain)[j/2 + length/2] = data_ref;
6180 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6181 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6182 vect[0], vect[1], select_mask);
6183 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6184 (*result_chain)[j/2] = data_ref;
6186 memcpy (dr_chain.address (), result_chain->address (),
6187 length * sizeof (tree));
6189 return true;
6191 if (length == 3 && vf > 2)
6193 unsigned int k = 0, l = 0;
6195 /* Generating permutation constant to get all elements in right order.
6196 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6197 for (i = 0; i < nelt; i++)
6199 if (3 * k + (l % 3) >= nelt)
6201 k = 0;
6202 l += (3 - (nelt % 3));
6204 sel[i] = 3 * k + (l % 3);
6205 k++;
6207 vec_perm_indices indices (sel, 2, nelt);
6208 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6210 if (dump_enabled_p ())
6211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6212 "shuffle of 3 fields structure is not \
6213 supported by target\n");
6214 return false;
6216 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6218 /* Generating permutation constant to shift all elements.
6219 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6220 for (i = 0; i < nelt; i++)
6221 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6222 indices.new_vector (sel, 2, nelt);
6223 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6225 if (dump_enabled_p ())
6226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6227 "shift permutation is not supported by target\n");
6228 return false;
6230 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6232 /* Generating permutation constant to shift all elements.
6233 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6234 for (i = 0; i < nelt; i++)
6235 sel[i] = 2 * (nelt / 3) + 1 + i;
6236 indices.new_vector (sel, 2, nelt);
6237 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6239 if (dump_enabled_p ())
6240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6241 "shift permutation is not supported by target\n");
6242 return false;
6244 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6246 /* Generating permutation constant to shift all elements.
6247 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6248 for (i = 0; i < nelt; i++)
6249 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6250 indices.new_vector (sel, 2, nelt);
6251 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6253 if (dump_enabled_p ())
6254 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6255 "shift permutation is not supported by target\n");
6256 return false;
6258 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6260 /* Generating permutation constant to shift all elements.
6261 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6262 for (i = 0; i < nelt; i++)
6263 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6264 indices.new_vector (sel, 2, nelt);
6265 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6267 if (dump_enabled_p ())
6268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6269 "shift permutation is not supported by target\n");
6270 return false;
6272 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6274 for (k = 0; k < 3; k++)
6276 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6277 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6278 dr_chain[k], dr_chain[k],
6279 perm3_mask);
6280 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6281 vect[k] = data_ref;
6284 for (k = 0; k < 3; k++)
6286 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6287 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6288 vect[k % 3], vect[(k + 1) % 3],
6289 shift1_mask);
6290 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6291 vect_shift[k] = data_ref;
6294 for (k = 0; k < 3; k++)
6296 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6297 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6298 vect_shift[(4 - k) % 3],
6299 vect_shift[(3 - k) % 3],
6300 shift2_mask);
6301 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6302 vect[k] = data_ref;
6305 (*result_chain)[3 - (nelt % 3)] = vect[2];
6307 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6308 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6309 vect[0], shift3_mask);
6310 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6311 (*result_chain)[nelt % 3] = data_ref;
6313 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6314 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6315 vect[1], shift4_mask);
6316 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6317 (*result_chain)[0] = data_ref;
6318 return true;
6320 return false;
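/* Illustrative sketch only: the "shift of a concatenation" primitive that
   the shiftN_masks above implement, modelled on plain int arrays.  It
   selects the NELT consecutive elements starting OFS elements into the
   concatenation of V1 followed by V2, i.e. the VEC_PERM_EXPR whose
   selector is {OFS, OFS + 1, ..., OFS + NELT - 1}.  The name and array
   representation are assumptions made for illustration.  */

static inline void
example_shift_concat (const int *v1, const int *v2, unsigned int nelt,
                      unsigned int ofs, int *result)
{
  for (unsigned int i = 0; i < nelt; i++)
    result[i] = ofs + i < nelt ? v1[ofs + i] : v2[ofs + i - nelt];
}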
6323 /* Function vect_transform_grouped_load.
6325 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6326 to perform their permutation and ascribe the resulting vectorized statements to
6327 the scalar statements.
6330 void
6331 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6332 gimple_stmt_iterator *gsi)
6334 machine_mode mode;
6335 vec<tree> result_chain = vNULL;
6337 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6338 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the permuted
6339 vectors that are ready for vector computation. */
6340 result_chain.create (size);
6342 /* If the reassociation width for the vector type is 2 or greater, the target
6343 machine can execute 2 or more vector instructions in parallel. Otherwise try
6344 to get the chain for the load group using vect_shift_permute_load_chain. */
6345 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6346 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6347 || pow2p_hwi (size)
6348 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6349 gsi, &result_chain))
6350 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6351 vect_record_grouped_load_vectors (stmt, result_chain);
6352 result_chain.release ();
6355 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6356 generated as part of the vectorization of STMT. Assign the statement
6357 for each vector to the associated scalar statement. */
6359 void
6360 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6362 gimple *first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6363 gimple *next_stmt, *new_stmt;
6364 unsigned int i, gap_count;
6365 tree tmp_data_ref;
6367 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6368 Since we scan the chain starting from its first node, their order
6369 corresponds to the order of data-refs in RESULT_CHAIN. */
6370 next_stmt = first_stmt;
6371 gap_count = 1;
6372 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6374 if (!next_stmt)
6375 break;
6377 /* Skip the gaps. Loads created for the gaps will be removed by dead
6378 code elimination pass later. No need to check for the first stmt in
6379 the group, since it always exists.
6380 DR_GROUP_GAP is the number of steps in elements from the previous
6381 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6382 correspond to the gaps. */
6383 if (next_stmt != first_stmt
6384 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
6386 gap_count++;
6387 continue;
6390 while (next_stmt)
6392 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6393 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6394 copies, and we put the new vector statement in the first available
6395 RELATED_STMT. */
6396 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6397 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6398 else
6400 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6402 gimple *prev_stmt =
6403 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6404 gimple *rel_stmt =
6405 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6406 while (rel_stmt)
6408 prev_stmt = rel_stmt;
6409 rel_stmt =
6410 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6413 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6414 new_stmt;
6418 next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6419 gap_count = 1;
6420 /* If NEXT_STMT accesses the same DR as the previous statement,
6421 put the same TMP_DATA_REF as its vectorized statement; otherwise
6422 get the next data-ref from RESULT_CHAIN. */
6423 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6424 break;
6429 /* Function vect_can_force_dr_alignment_p.
6431 Returns whether the alignment of a DECL can be forced to be aligned
6432 on an ALIGNMENT-bit boundary. */
6434 bool
6435 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6437 if (!VAR_P (decl))
6438 return false;
6440 if (decl_in_symtab_p (decl)
6441 && !symtab_node::get (decl)->can_increase_alignment_p ())
6442 return false;
6444 if (TREE_STATIC (decl))
6445 return (alignment <= MAX_OFILE_ALIGNMENT);
6446 else
6447 return (alignment <= MAX_STACK_ALIGNMENT);
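/* Illustrative sketch only, an assumed usage pattern rather than code taken
   from this file: how a caller might raise a variable's alignment once the
   check above has succeeded, using the existing SET_DECL_ALIGN and
   DECL_USER_ALIGN accessors.  ALIGNMENT is in bits, as for
   vect_can_force_dr_alignment_p; the helper name is an assumption.  */

static inline void
example_force_decl_alignment (tree decl, unsigned int alignment)
{
  if (vect_can_force_dr_alignment_p (decl, alignment)
      && DECL_ALIGN (decl) < alignment)
    {
      SET_DECL_ALIGN (decl, alignment);
      DECL_USER_ALIGN (decl) = 1;
    }
}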
6451 /* Return whether the data reference DR is supported with respect to its
6452 alignment.
6453 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6454 if it is aligned, i.e., check if it is possible to vectorize it with different
6455 alignment. */
6457 enum dr_alignment_support
6458 vect_supportable_dr_alignment (struct data_reference *dr,
6459 bool check_aligned_accesses)
6461 gimple *stmt = vect_dr_stmt (dr);
6462 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6463 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6464 machine_mode mode = TYPE_MODE (vectype);
6465 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6466 struct loop *vect_loop = NULL;
6467 bool nested_in_vect_loop = false;
6469 if (aligned_access_p (dr) && !check_aligned_accesses)
6470 return dr_aligned;
6472 /* For now assume all conditional loads/stores support unaligned
6473 access without any special code. */
6474 if (is_gimple_call (stmt)
6475 && gimple_call_internal_p (stmt)
6476 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6477 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6478 return dr_unaligned_supported;
6480 if (loop_vinfo)
6482 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6483 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6486 /* Possibly unaligned access. */
6488 /* We can choose between using the implicit realignment scheme (generating
6489 a misaligned_move stmt) and the explicit realignment scheme (generating
6490 aligned loads with a REALIGN_LOAD). There are two variants to the
6491 explicit realignment scheme: optimized, and unoptimized.
6492 We can optimize the realignment only if the step between consecutive
6493 vector loads is equal to the vector size. Since the vector memory
6494 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6495 is guaranteed that the misalignment amount remains the same throughout the
6496 execution of the vectorized loop. Therefore, we can create the
6497 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6498 at the loop preheader.
6500 However, in the case of outer-loop vectorization, when vectorizing a
6501 memory access in the inner-loop nested within the LOOP that is now being
6502 vectorized, while it is guaranteed that the misalignment of the
6503 vectorized memory access will remain the same in different outer-loop
6504 iterations, it is *not* guaranteed that it will remain the same throughout
6505 the execution of the inner-loop. This is because the inner-loop advances
6506 with the original scalar step (and not in steps of VS). If the inner-loop
6507 step happens to be a multiple of VS, then the misalignment remains fixed
6508 and we can use the optimized realignment scheme. For example:
6510 for (i=0; i<N; i++)
6511 for (j=0; j<M; j++)
6512 s += a[i+j];
6514 When vectorizing the i-loop in the above example, the step between
6515 consecutive vector loads is 1, and so the misalignment does not remain
6516 fixed across the execution of the inner-loop, and the realignment cannot
6517 be optimized (as illustrated in the following pseudo vectorized loop):
6519 for (i=0; i<N; i+=4)
6520 for (j=0; j<M; j++){
6521 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6522 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6523 // (assuming that we start from an aligned address).
6526 We therefore have to use the unoptimized realignment scheme:
6528 for (i=0; i<N; i+=4)
6529 for (j=k; j<M; j+=4)
6530 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6531 // that the misalignment of the initial address is
6532 // 0).
6534 The loop can then be vectorized as follows:
6536 for (k=0; k<4; k++){
6537 rt = get_realignment_token (&vp[k]);
6538 for (i=0; i<N; i+=4){
6539 v1 = vp[i+k];
6540 for (j=k; j<M; j+=4){
6541 v2 = vp[i+j+VS-1];
6542 va = REALIGN_LOAD <v1,v2,rt>;
6543 vs += va;
6544 v1 = v2;
6547 } */
6549 if (DR_IS_READ (dr))
6551 bool is_packed = false;
6552 tree type = (TREE_TYPE (DR_REF (dr)));
6554 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6555 && (!targetm.vectorize.builtin_mask_for_load
6556 || targetm.vectorize.builtin_mask_for_load ()))
6558 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6560 /* If we are doing SLP then the accesses need not have the
6561 same alignment, instead it depends on the SLP group size. */
6562 if (loop_vinfo
6563 && STMT_SLP_TYPE (stmt_info)
6564 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6565 * DR_GROUP_SIZE (vinfo_for_stmt
6566 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6567 TYPE_VECTOR_SUBPARTS (vectype)))
6569 else if (!loop_vinfo
6570 || (nested_in_vect_loop
6571 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6572 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6573 return dr_explicit_realign;
6574 else
6575 return dr_explicit_realign_optimized;
6577 if (!known_alignment_for_access_p (dr))
6578 is_packed = not_size_aligned (DR_REF (dr));
6580 if (targetm.vectorize.support_vector_misalignment
6581 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6582 /* Can't software pipeline the loads, but can at least do them. */
6583 return dr_unaligned_supported;
6585 else
6587 bool is_packed = false;
6588 tree type = (TREE_TYPE (DR_REF (dr)));
6590 if (!known_alignment_for_access_p (dr))
6591 is_packed = not_size_aligned (DR_REF (dr));
6593 if (targetm.vectorize.support_vector_misalignment
6594 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6595 return dr_unaligned_supported;
6598 /* Unsupported. */
6599 return dr_unaligned_unsupported;
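/* Illustrative sketch only, an assumed usage pattern rather than code taken
   from this file: a caller typically branches on the enum returned by
   vect_supportable_dr_alignment, treating every value other than
   dr_unaligned_unsupported as vectorizable.  The helper name is an
   assumption made for illustration.  */

static inline bool
example_dr_alignment_ok_p (struct data_reference *dr)
{
  enum dr_alignment_support support
    = vect_supportable_dr_alignment (dr, false);
  return support != dr_unaligned_unsupported;
}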