gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
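/* As an illustration (a sketch, not code from this file): the load-lanes
   check above matters for interleaved accesses such as

     for (i = 0; i < n; i++)
       {
	 a[i] = p[2 * i];	    <- lane 0
	 b[i] = p[2 * i + 1];	    <- lane 1
       }

   where a target such as AArch64 can load COUNT == 2 interleaved vectors
   with a single LD2 instruction, provided it advertises an array mode for
   the pair of vectors and implements the convert optab tested above.  */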
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different from the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 if (is_gimple_assign (stmt)
134 && (gimple_assign_cast_p (stmt)
135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 *lhs_size_unit = lhs;
149 *rhs_size_unit = rhs;
150 return scalar_type;
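/* Illustrative example (not taken from this file): in a loop like

     short *s;  int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i] + 1;

   the statement's result type is int, but the smallest scalar type feeding
   it is short, so the function above returns the 2-byte type and, for
   128-bit vectors, the vectorization factor is chosen as 8 rather than 4.  */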
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
158 static bool
159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164 return false;
166 if (!runtime_alias_check_p (ddr, loop,
167 optimize_loop_nest_for_speed_p (loop)))
168 return false;
170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171 return true;
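/* Illustrative sketch of the versioning this enables (not code from this
   file): for

     void f (int *p, int *q, int n)
     {
       for (int i = 0; i < n; i++)
	 p[i] = q[i] + 1;
     }

   the vectorizer can emit a guard roughly equivalent to

     if (p + n <= q || q + n <= p)
       ... vectorized loop ...
     else
       ... scalar fallback loop ...

   and PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS bounds how many such checks
   may be accumulated before versioning is given up.  */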
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
176 static void
177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
179 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180 for (unsigned int i = 0; i < checks.length(); ++i)
181 if (checks[i] == value)
182 return;
184 if (dump_enabled_p ())
186 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188 dump_printf (MSG_NOTE, " is nonzero\n");
190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
193 /* Return true if we know that the order of vectorized STMT_A and
194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195 At least one of the statements is a write. */
197 static bool
198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
200 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206 return true;
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 group are emitted at the position of the first scalar load and all
210 stores in a group are emitted at the position of the last scalar store.
211 Thus writes will happen no earlier than their current position
212 (but could happen later) while reads will happen no later than their
213 current position (but could happen earlier). Reordering is therefore
214 only possible if the first access is a write. */
215 gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
216 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
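/* Illustrative example (sketch only): because grouped loads are emitted at
   the position of the first scalar load and grouped stores at the position
   of the last scalar store, a sequence like

     a[i] = x;		<- write, may sink to the last store of its group
     ...  = a[i];	<- read, may hoist to the first load of its group

   can have the write reordered after the read, so the function above
   returns false when the earlier access is a write.  When the read comes
   first, reads only move earlier and writes only move later, so the scalar
   order is preserved.  */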
219 /* A subroutine of vect_analyze_data_ref_dependence. Handle
220 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
221 distances. These distances are conservatively correct but they don't
222 reflect a guaranteed dependence.
224 Return true if this function does all the work necessary to avoid
225 an alias or false if the caller should use the dependence distances
226 to limit the vectorization factor in the usual way. LOOP_DEPTH is
227 the depth of the loop described by LOOP_VINFO and the other arguments
228 are as for vect_analyze_data_ref_dependence. */
230 static bool
231 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
232 loop_vec_info loop_vinfo,
233 int loop_depth, unsigned int *max_vf)
235 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
236 lambda_vector dist_v;
237 unsigned int i;
238 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
240 int dist = dist_v[loop_depth];
241 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
243 /* If the user asserted safelen >= DIST consecutive iterations
244 can be executed concurrently, assume independence.
246 ??? An alternative would be to add the alias check even
247 in this case, and vectorize the fallback loop with the
248 maximum VF set to safelen. However, if the user has
249 explicitly given a length, it's less likely that that
250 would be a win. */
251 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
253 if ((unsigned int) loop->safelen < *max_vf)
254 *max_vf = loop->safelen;
255 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
256 continue;
259 /* For dependence distances of 2 or more, we have the option
260 of limiting VF or checking for an alias at runtime.
261 Prefer to check at runtime if we can, to avoid limiting
262 the VF unnecessarily when the bases are in fact independent.
264 Note that the alias checks will be removed if the VF ends up
265 being small enough. */
266 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
269 return true;
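/* Illustrative example (a sketch; the exact conditions are decided by the
   dependence analyzer): for

     void f (int *a, int *b, int n)
     {
       for (int i = 0; i < n; i++)
	 a[i] = b[i - 2] + 1;
     }

   assuming A and B might refer to the same array gives a conservative
   dependence distance of 2, which by itself would cap the VF at 2.  When
   the cost model allows it, the code above instead queues a run-time alias
   check so that the common case of distinct arrays can still use the full
   vectorization factor.  */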
273 /* Function vect_analyze_data_ref_dependence.
275 Return TRUE if there (might) exist a dependence between a memory-reference
276 DRA and a memory-reference DRB. When versioning for alias can instead
277 check the dependence at run-time, return FALSE. Adjust *MAX_VF according to
278 the data dependence. */
280 static bool
281 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
282 loop_vec_info loop_vinfo,
283 unsigned int *max_vf)
285 unsigned int i;
286 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
287 struct data_reference *dra = DDR_A (ddr);
288 struct data_reference *drb = DDR_B (ddr);
289 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
290 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
291 lambda_vector dist_v;
292 unsigned int loop_depth;
294 /* In loop analysis all data references should be vectorizable. */
295 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
296 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
297 gcc_unreachable ();
299 /* Independent data accesses. */
300 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
301 return false;
303 if (dra == drb
304 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
305 return false;
307 /* We do not have to consider dependences between accesses that belong
308 to the same group. */
309 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
310 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
311 return false;
313 /* Even if we have an anti-dependence then, as the vectorized loop covers at
314 least two scalar iterations, there is always also a true dependence.
315 As the vectorizer does not re-order loads and stores we can ignore
316 the anti-dependence if TBAA can disambiguate both DRs similar to the
317 case with known negative distance anti-dependences (positive
318 distance anti-dependences would violate TBAA constraints). */
319 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
320 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
321 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
322 get_alias_set (DR_REF (drb))))
323 return false;
325 /* Unknown data dependence. */
326 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
328 /* If user asserted safelen consecutive iterations can be
329 executed concurrently, assume independence. */
330 if (loop->safelen >= 2)
332 if ((unsigned int) loop->safelen < *max_vf)
333 *max_vf = loop->safelen;
334 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
335 return false;
338 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
339 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
341 if (dump_enabled_p ())
343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
344 "versioning for alias not supported for: "
345 "can't determine dependence between ");
346 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
347 DR_REF (dra));
348 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
349 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
350 DR_REF (drb));
351 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
353 return true;
356 if (dump_enabled_p ())
358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
359 "versioning for alias required: "
360 "can't determine dependence between ");
361 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
362 DR_REF (dra));
363 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
364 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
365 DR_REF (drb));
366 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
369 /* Add to list of ddrs that need to be tested at run-time. */
370 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
373 /* Known data dependence. */
374 if (DDR_NUM_DIST_VECTS (ddr) == 0)
376 /* If user asserted safelen consecutive iterations can be
377 executed concurrently, assume independence. */
378 if (loop->safelen >= 2)
380 if ((unsigned int) loop->safelen < *max_vf)
381 *max_vf = loop->safelen;
382 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
383 return false;
386 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
387 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
389 if (dump_enabled_p ())
391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
392 "versioning for alias not supported for: "
393 "bad dist vector for ");
394 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
395 DR_REF (dra));
396 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
397 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
398 DR_REF (drb));
399 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
401 return true;
404 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "versioning for alias required: "
408 "bad dist vector for ");
409 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
410 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
411 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
412 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
414 /* Add to list of ddrs that need to be tested at run-time. */
415 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
418 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
420 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
421 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
422 loop_depth, max_vf))
423 return false;
425 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
427 int dist = dist_v[loop_depth];
429 if (dump_enabled_p ())
430 dump_printf_loc (MSG_NOTE, vect_location,
431 "dependence distance = %d.\n", dist);
433 if (dist == 0)
435 if (dump_enabled_p ())
437 dump_printf_loc (MSG_NOTE, vect_location,
438 "dependence distance == 0 between ");
439 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
440 dump_printf (MSG_NOTE, " and ");
441 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
442 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
445 /* When we perform grouped accesses and perform implicit CSE
446 by detecting equal accesses and doing disambiguation with
447 runtime alias tests like for
448 .. = a[i];
449 .. = a[i+1];
450 a[i] = ..;
451 a[i+1] = ..;
452 *p = ..;
453 .. = a[i];
454 .. = a[i+1];
455 where we will end up loading { a[i], a[i+1] } once, make
456 sure that inserting group loads before the first load and
457 stores after the last store will do the right thing.
458 Similar for groups like
459 a[i] = ...;
460 ... = a[i];
461 a[i+1] = ...;
462 where loads from the group interleave with the store. */
463 if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
465 if (dump_enabled_p ())
466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
467 "READ_WRITE dependence in interleaving.\n");
468 return true;
471 if (loop->safelen < 2)
473 tree indicator = dr_zero_step_indicator (dra);
474 if (TREE_CODE (indicator) != INTEGER_CST)
475 vect_check_nonzero_value (loop_vinfo, indicator);
476 else if (integer_zerop (indicator))
478 if (dump_enabled_p ())
479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
480 "access also has a zero step\n");
481 return true;
484 continue;
487 if (dist > 0 && DDR_REVERSED_P (ddr))
489 /* If DDR_REVERSED_P the order of the data-refs in DDR was
490 reversed (to make distance vector positive), and the actual
491 distance is negative. */
492 if (dump_enabled_p ())
493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
494 "dependence distance negative.\n");
495 /* Record a negative dependence distance to later limit the
496 amount of stmt copying / unrolling we can perform.
497 Only need to handle read-after-write dependence. */
498 if (DR_IS_READ (drb)
499 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
500 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
501 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
502 continue;
505 unsigned int abs_dist = abs (dist);
506 if (abs_dist >= 2 && abs_dist < *max_vf)
508 /* The dependence distance requires reduction of the maximal
509 vectorization factor. */
510 *max_vf = abs (dist);
511 if (dump_enabled_p ())
512 dump_printf_loc (MSG_NOTE, vect_location,
513 "adjusting maximal vectorization factor to %i\n",
514 *max_vf);
517 if (abs_dist >= *max_vf)
519 /* Dependence distance does not create dependence, as far as
520 vectorization is concerned, in this case. */
521 if (dump_enabled_p ())
522 dump_printf_loc (MSG_NOTE, vect_location,
523 "dependence distance >= VF.\n");
524 continue;
527 if (dump_enabled_p ())
529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
530 "not vectorized, possible dependence "
531 "between data-refs ");
532 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
533 dump_printf (MSG_NOTE, " and ");
534 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
535 dump_printf (MSG_NOTE, "\n");
538 return true;
541 return false;
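/* Illustrative example (not from this file): in

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] * 2;

   the read-after-write dependence has distance 3, so the analysis above
   lowers *MAX_VF to 3: with at most 3 iterations per vector, each vector
   iteration only reads elements written by earlier vector iterations.
   A distance of 0, or a distance >= the VF, needs no such restriction.  */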
544 /* Function vect_analyze_data_ref_dependences.
546 Examine all the data references in the loop, and make sure there do not
547 exist any data dependences between them. Set *MAX_VF according to
548 the maximum vectorization factor the data dependences allow. */
550 bool
551 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
552 unsigned int *max_vf)
554 unsigned int i;
555 struct data_dependence_relation *ddr;
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE, vect_location,
559 "=== vect_analyze_data_ref_dependences ===\n");
561 LOOP_VINFO_DDRS (loop_vinfo)
562 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
563 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
564 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
565 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
566 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
567 &LOOP_VINFO_DDRS (loop_vinfo),
568 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
569 return false;
571 /* For epilogues we either have no aliases or alias versioning
572 was applied to the original loop. Therefore we may just get max_vf
573 using the VF of the original loop. */
574 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
575 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
576 else
577 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
578 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
579 return false;
581 return true;
585 /* Function vect_slp_analyze_data_ref_dependence.
587 Return TRUE if there (might) exist a dependence between a memory-reference
588 DRA and a memory-reference DRB that would prohibit SLP vectorization.
589 Unlike the loop case there is no *MAX_VF to adjust here and no
590 run-time alias versioning fallback. */
592 static bool
593 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
595 struct data_reference *dra = DDR_A (ddr);
596 struct data_reference *drb = DDR_B (ddr);
598 /* We need to check dependences of statements marked as unvectorizable
599 as well, they still can prohibit vectorization. */
601 /* Independent data accesses. */
602 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
603 return false;
605 if (dra == drb)
606 return false;
608 /* Read-read is OK. */
609 if (DR_IS_READ (dra) && DR_IS_READ (drb))
610 return false;
612 /* If dra and drb are part of the same interleaving chain consider
613 them independent. */
614 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
615 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
616 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
617 return false;
619 /* Unknown data dependence. */
620 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
622 if (dump_enabled_p ())
624 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
625 "can't determine dependence between ");
626 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
627 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
628 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
629 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
632 else if (dump_enabled_p ())
634 dump_printf_loc (MSG_NOTE, vect_location,
635 "determined dependence between ");
636 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
637 dump_printf (MSG_NOTE, " and ");
638 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
639 dump_printf (MSG_NOTE, "\n");
642 return true;
646 /* Analyze dependences involved in the transform of SLP NODE. STORES
647 contain the vector of scalar stores of this instance if we are
648 disambiguating the loads. */
650 static bool
651 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
652 vec<gimple *> stores, gimple *last_store)
654 /* This walks over all stmts involved in the SLP load/store done
655 in NODE verifying we can sink them up to the last stmt in the
656 group. */
657 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
658 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
660 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
661 if (access == last_access)
662 continue;
663 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
664 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
665 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
667 gimple *stmt = gsi_stmt (gsi);
668 if (! gimple_vuse (stmt)
669 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
670 continue;
672 /* If we couldn't record a (single) data reference for this
673 stmt we have to give up. */
674 /* ??? Here and below if dependence analysis fails we can resort
675 to the alias oracle which can handle more kinds of stmts. */
676 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
677 if (!dr_b)
678 return false;
680 bool dependent = false;
681 /* If we run into a store of this same instance (we've just
682 marked those) then delay dependence checking until we run
683 into the last store because this is where it will have
684 been sunk to (and we verify if we can do that as well). */
685 if (gimple_visited_p (stmt))
687 if (stmt != last_store)
688 continue;
689 unsigned i;
690 gimple *store;
691 FOR_EACH_VEC_ELT (stores, i, store)
693 data_reference *store_dr
694 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
695 ddr_p ddr = initialize_data_dependence_relation
696 (dr_a, store_dr, vNULL);
697 dependent = vect_slp_analyze_data_ref_dependence (ddr);
698 free_dependence_relation (ddr);
699 if (dependent)
700 break;
703 else
705 ddr_p ddr = initialize_data_dependence_relation (dr_a,
706 dr_b, vNULL);
707 dependent = vect_slp_analyze_data_ref_dependence (ddr);
708 free_dependence_relation (ddr);
710 if (dependent)
711 return false;
714 return true;
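/* Illustrative example (sketch only): for an SLP store group with an
   intervening store

     a[0] = x0;
     *p   = y;	      <- may alias a[]
     a[1] = x1;

   the vector store for { a[0], a[1] } is emitted at the position of the
   last group member, i.e. after *p, so the walk above must verify that the
   store to a[0] can be sunk past *p.  If *p might alias the group, the
   dependence test fails and the instance is rejected.  */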
718 /* Function vect_slp_analyze_instance_dependence.
720 Examine all the data references in SLP instance INSTANCE, and make sure
721 there do not exist any data dependences between them that would prevent
722 the instance from being vectorized. */
724 bool
725 vect_slp_analyze_instance_dependence (slp_instance instance)
727 if (dump_enabled_p ())
728 dump_printf_loc (MSG_NOTE, vect_location,
729 "=== vect_slp_analyze_instance_dependence ===\n");
731 /* The stores of this instance are at the root of the SLP tree. */
732 slp_tree store = SLP_INSTANCE_TREE (instance);
733 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
734 store = NULL;
736 /* Verify we can sink stores to the vectorized stmt insert location. */
737 gimple *last_store = NULL;
738 if (store)
740 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
741 return false;
743 /* Mark stores in this instance and remember the last one. */
744 last_store = vect_find_last_scalar_stmt_in_slp (store);
745 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
746 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
749 bool res = true;
751 /* Verify we can sink loads to the vectorized stmt insert location,
752 special-casing stores of this instance. */
753 slp_tree load;
754 unsigned int i;
755 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
756 if (! vect_slp_analyze_node_dependences (instance, load,
757 store
758 ? SLP_TREE_SCALAR_STMTS (store)
759 : vNULL, last_store))
761 res = false;
762 break;
765 /* Unset the visited flag. */
766 if (store)
767 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
768 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
770 return res;
773 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
774 the statement that contains DRB, which is useful for recording in the
775 dump file. */
777 static void
778 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
779 innermost_loop_behavior *drb)
781 bool existed;
782 innermost_loop_behavior *&entry
783 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
784 if (!existed || entry->base_alignment < drb->base_alignment)
786 entry = drb;
787 if (dump_enabled_p ())
789 dump_printf_loc (MSG_NOTE, vect_location,
790 "recording new base alignment for ");
791 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
792 dump_printf (MSG_NOTE, "\n");
793 dump_printf_loc (MSG_NOTE, vect_location,
794 " alignment: %d\n", drb->base_alignment);
795 dump_printf_loc (MSG_NOTE, vect_location,
796 " misalignment: %d\n", drb->base_misalignment);
797 dump_printf_loc (MSG_NOTE, vect_location,
798 " based on: ");
799 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
804 /* If the region we're going to vectorize is reached, all unconditional
805 data references occur at least once. We can therefore pool the base
806 alignment guarantees from each unconditional reference. Do this by
807 going through all the data references in VINFO and checking whether
808 the containing statement makes the reference unconditionally. If so,
809 record the alignment of the base address in VINFO so that it can be
810 used for all other references with the same base. */
812 void
813 vect_record_base_alignments (vec_info *vinfo)
815 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
816 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
817 data_reference *dr;
818 unsigned int i;
819 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
820 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
822 gimple *stmt = DR_STMT (dr);
823 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
825 /* If DR is nested in the loop that is being vectorized, we can also
826 record the alignment of the base wrt the outer loop. */
827 if (loop && nested_in_vect_loop_p (loop, stmt))
829 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
830 vect_record_base_alignment
831 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
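/* Illustrative example (sketch only): if the region to be vectorized
   contains

     x = p->a[i];	  <- executed unconditionally
     if (c)
       p->b[i] = x;	  <- conditional, same base address P

   then the alignment guarantee derived from the unconditional load of P's
   base address is recorded once and can be reused when computing the
   alignment of the conditional store, even though the store on its own is
   not known to execute.  */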
836 /* Return the target alignment for the vectorized form of DR. */
838 static unsigned int
839 vect_calculate_target_alignment (struct data_reference *dr)
841 gimple *stmt = DR_STMT (dr);
842 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
843 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
844 return targetm.vectorize.preferred_vector_alignment (vectype);
847 /* Function vect_compute_data_ref_alignment
849 Compute the misalignment of the data reference DR.
851 Output:
852 1. If during the misalignment computation it is found that the data reference
853 cannot be vectorized then false is returned.
854 2. DR_MISALIGNMENT (DR) is defined.
856 FOR NOW: No analysis is actually performed. Misalignment is calculated
857 only for trivial cases. TODO. */
859 bool
860 vect_compute_data_ref_alignment (struct data_reference *dr)
862 gimple *stmt = DR_STMT (dr);
863 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
864 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
865 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
866 struct loop *loop = NULL;
867 tree ref = DR_REF (dr);
868 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
870 if (dump_enabled_p ())
871 dump_printf_loc (MSG_NOTE, vect_location,
872 "vect_compute_data_ref_alignment:\n");
874 if (loop_vinfo)
875 loop = LOOP_VINFO_LOOP (loop_vinfo);
877 /* Initialize misalignment to unknown. */
878 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
880 innermost_loop_behavior *drb = vect_dr_behavior (dr);
881 bool step_preserves_misalignment_p;
883 unsigned HOST_WIDE_INT vector_alignment
884 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
885 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
887 /* No step for BB vectorization. */
888 if (!loop)
890 gcc_assert (integer_zerop (drb->step));
891 step_preserves_misalignment_p = true;
894 /* In case the dataref is in an inner-loop of the loop that is being
895 vectorized (LOOP), we use the base and misalignment information
896 relative to the outer-loop (LOOP). This is ok only if the misalignment
897 stays the same throughout the execution of the inner-loop, which is why
898 we have to check that the stride of the dataref in the inner-loop evenly
899 divides by the vector alignment. */
900 else if (nested_in_vect_loop_p (loop, stmt))
902 step_preserves_misalignment_p
903 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
905 if (dump_enabled_p ())
907 if (step_preserves_misalignment_p)
908 dump_printf_loc (MSG_NOTE, vect_location,
909 "inner step divides the vector alignment.\n");
910 else
911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
912 "inner step doesn't divide the vector"
913 " alignment.\n");
917 /* Similarly we can only use base and misalignment information relative to
918 an innermost loop if the misalignment stays the same throughout the
919 execution of the loop. As above, this is the case if the stride of
920 the dataref evenly divides by the alignment. */
921 else
923 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
924 step_preserves_misalignment_p
925 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
927 if (!step_preserves_misalignment_p && dump_enabled_p ())
928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
929 "step doesn't divide the vector alignment.\n");
932 unsigned int base_alignment = drb->base_alignment;
933 unsigned int base_misalignment = drb->base_misalignment;
935 /* Calculate the maximum of the pooled base address alignment and the
936 alignment that we can compute for DR itself. */
937 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
938 if (entry && base_alignment < (*entry)->base_alignment)
940 base_alignment = (*entry)->base_alignment;
941 base_misalignment = (*entry)->base_misalignment;
944 if (drb->offset_alignment < vector_alignment
945 || !step_preserves_misalignment_p
946 /* We need to know whether the step wrt the vectorized loop is
947 negative when computing the starting misalignment below. */
948 || TREE_CODE (drb->step) != INTEGER_CST)
950 if (dump_enabled_p ())
952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
953 "Unknown alignment for access: ");
954 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
955 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
957 return true;
960 if (base_alignment < vector_alignment)
962 unsigned int max_alignment;
963 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
964 if (max_alignment < vector_alignment
965 || !vect_can_force_dr_alignment_p (base,
966 vector_alignment * BITS_PER_UNIT))
968 if (dump_enabled_p ())
970 dump_printf_loc (MSG_NOTE, vect_location,
971 "can't force alignment of ref: ");
972 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
973 dump_printf (MSG_NOTE, "\n");
975 return true;
978 /* Force the alignment of the decl.
979 NOTE: This is the only change to the code we make during
980 the analysis phase, before deciding to vectorize the loop. */
981 if (dump_enabled_p ())
983 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
984 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
985 dump_printf (MSG_NOTE, "\n");
988 DR_VECT_AUX (dr)->base_decl = base;
989 DR_VECT_AUX (dr)->base_misaligned = true;
990 base_misalignment = 0;
992 poly_int64 misalignment
993 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
995 /* If this is a backward running DR then first access in the larger
996 vectype actually is N-1 elements before the address in the DR.
997 Adjust misalign accordingly. */
998 if (tree_int_cst_sgn (drb->step) < 0)
999 /* PLUS because STEP is negative. */
1000 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1001 * TREE_INT_CST_LOW (drb->step));
1003 unsigned int const_misalignment;
1004 if (!known_misalignment (misalignment, vector_alignment,
1005 &const_misalignment))
1007 if (dump_enabled_p ())
1009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1010 "Non-constant misalignment for access: ");
1011 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1012 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1014 return true;
1017 SET_DR_MISALIGNMENT (dr, const_misalignment);
1019 if (dump_enabled_p ())
1021 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1022 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1023 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1024 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1027 return true;
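/* Worked example (illustrative only, assuming a 16-byte preferred vector
   alignment): for

     struct { char pad[4]; int a[100]; } s;   <- S itself 16-byte aligned
     for (i = 0; i < n; i++)
       s.a[i] = 0;

   the base alignment is 16 with base misalignment 4 and DR_INIT 0, so the
   code above sets DR_MISALIGNMENT to 4 bytes; peeling (16 - 4) / 4 == 3
   scalar iterations would then make the access aligned to the target
   alignment.  */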
1030 /* Function vect_update_misalignment_for_peel.
1031 Sets DR's misalignment
1032 - to 0 if it has the same alignment as DR_PEEL,
1033 - to the misalignment computed using NPEEL if DR's misalignment is known,
1034 - to -1 (unknown) otherwise.
1036 DR - the data reference whose misalignment is to be adjusted.
1037 DR_PEEL - the data reference whose misalignment is being made
1038 zero in the vector loop by the peel.
1039 NPEEL - the number of iterations in the peel loop if the misalignment
1040 of DR_PEEL is known at compile time. */
1042 static void
1043 vect_update_misalignment_for_peel (struct data_reference *dr,
1044 struct data_reference *dr_peel, int npeel)
1046 unsigned int i;
1047 vec<dr_p> same_aligned_drs;
1048 struct data_reference *current_dr;
1049 int dr_size = vect_get_scalar_dr_size (dr);
1050 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1051 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1052 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1054 /* For interleaved data accesses the step in the loop must be multiplied by
1055 the size of the interleaving group. */
1056 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1057 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1058 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1059 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1061 /* It can be assumed that the data refs with the same alignment as dr_peel
1062 are aligned in the vector loop. */
1063 same_aligned_drs
1064 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1065 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1067 if (current_dr != dr)
1068 continue;
1069 gcc_assert (!known_alignment_for_access_p (dr)
1070 || !known_alignment_for_access_p (dr_peel)
1071 || (DR_MISALIGNMENT (dr) / dr_size
1072 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1073 SET_DR_MISALIGNMENT (dr, 0);
1074 return;
1077 if (known_alignment_for_access_p (dr)
1078 && known_alignment_for_access_p (dr_peel))
1080 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1081 int misal = DR_MISALIGNMENT (dr);
1082 misal += negative ? -npeel * dr_size : npeel * dr_size;
1083 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1084 SET_DR_MISALIGNMENT (dr, misal);
1085 return;
1088 if (dump_enabled_p ())
1089 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1090 "to unknown (-1).\n");
1091 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
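/* Worked example (illustrative only): with a 16-byte target alignment and
   4-byte elements, suppose DR_PEEL has misalignment 8 and DR has
   misalignment 12.  Peeling NPEEL == (16 - 8) / 4 == 2 iterations makes
   DR_PEEL aligned, and the known-alignment branch above updates DR to

     (12 + 2 * 4) & (16 - 1) == 4

   i.e. DR is still misaligned, but by a known amount.  */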
1095 /* Function verify_data_ref_alignment
1097 Return TRUE if DR can be handled with respect to alignment. */
1099 static bool
1100 verify_data_ref_alignment (data_reference_p dr)
1102 enum dr_alignment_support supportable_dr_alignment
1103 = vect_supportable_dr_alignment (dr, false);
1104 if (!supportable_dr_alignment)
1106 if (dump_enabled_p ())
1108 if (DR_IS_READ (dr))
1109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1110 "not vectorized: unsupported unaligned load.");
1111 else
1112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1113 "not vectorized: unsupported unaligned "
1114 "store.");
1116 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1117 DR_REF (dr));
1118 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1120 return false;
1123 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1124 dump_printf_loc (MSG_NOTE, vect_location,
1125 "Vectorizing an unaligned access.\n");
1127 return true;
1130 /* Function vect_verify_datarefs_alignment
1132 Return TRUE if all data references in the loop can be
1133 handled with respect to alignment. */
1135 bool
1136 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1138 vec<data_reference_p> datarefs = vinfo->datarefs;
1139 struct data_reference *dr;
1140 unsigned int i;
1142 FOR_EACH_VEC_ELT (datarefs, i, dr)
1144 gimple *stmt = DR_STMT (dr);
1145 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1147 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1148 continue;
1150 /* For interleaving, only the alignment of the first access matters. */
1151 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1152 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1153 continue;
1155 /* Strided accesses perform only component accesses, alignment is
1156 irrelevant for them. */
1157 if (STMT_VINFO_STRIDED_P (stmt_info)
1158 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1159 continue;
1161 if (! verify_data_ref_alignment (dr))
1162 return false;
1165 return true;
1168 /* Given a memory reference EXP return whether its alignment is less
1169 than its size. */
1171 static bool
1172 not_size_aligned (tree exp)
1174 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1175 return true;
1177 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1178 > get_object_alignment (exp));
1181 /* Function vector_alignment_reachable_p
1183 Return true if vector alignment for DR is reachable by peeling
1184 a few loop iterations. Return false otherwise. */
1186 static bool
1187 vector_alignment_reachable_p (struct data_reference *dr)
1189 gimple *stmt = DR_STMT (dr);
1190 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1191 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1193 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1195 /* For interleaved access we peel only if number of iterations in
1196 the prolog loop ({VF - misalignment}), is a multiple of the
1197 number of the interleaved accesses. */
1198 int elem_size, mis_in_elements;
1200 /* FORNOW: handle only known alignment. */
1201 if (!known_alignment_for_access_p (dr))
1202 return false;
1204 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1205 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1206 elem_size = vector_element_size (vector_size, nelements);
1207 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1209 if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1210 return false;
1213 /* If misalignment is known at the compile time then allow peeling
1214 only if natural alignment is reachable through peeling. */
1215 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1217 HOST_WIDE_INT elmsize =
1218 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1219 if (dump_enabled_p ())
1221 dump_printf_loc (MSG_NOTE, vect_location,
1222 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1223 dump_printf (MSG_NOTE,
1224 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1226 if (DR_MISALIGNMENT (dr) % elmsize)
1228 if (dump_enabled_p ())
1229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1230 "data size does not divide the misalignment.\n");
1231 return false;
1235 if (!known_alignment_for_access_p (dr))
1237 tree type = TREE_TYPE (DR_REF (dr));
1238 bool is_packed = not_size_aligned (DR_REF (dr));
1239 if (dump_enabled_p ())
1240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1241 "Unknown misalignment, %snaturally aligned\n",
1242 is_packed ? "not " : "");
1243 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1246 return true;
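/* Worked example (illustrative only): for a group of 2 interleaved accesses
   with 4 elements per vector and a misalignment of 1 element, the prologue
   would need to peel 4 - 1 == 3 scalar iterations; 3 is not a multiple of
   the group size 2, so the function above returns false.  With a
   misalignment of 2 elements the peel count is 2 and peeling can reach the
   required alignment.  */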
1250 /* Calculate the cost of the memory access represented by DR. */
1252 static void
1253 vect_get_data_access_cost (struct data_reference *dr,
1254 unsigned int *inside_cost,
1255 unsigned int *outside_cost,
1256 stmt_vector_for_cost *body_cost_vec)
1258 gimple *stmt = DR_STMT (dr);
1259 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1260 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1261 int ncopies;
1263 if (PURE_SLP_STMT (stmt_info))
1264 ncopies = 1;
1265 else
1266 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1268 if (DR_IS_READ (dr))
1269 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1270 NULL, body_cost_vec, false);
1271 else
1272 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1274 if (dump_enabled_p ())
1275 dump_printf_loc (MSG_NOTE, vect_location,
1276 "vect_get_data_access_cost: inside_cost = %d, "
1277 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1281 typedef struct _vect_peel_info
1283 struct data_reference *dr;
1284 int npeel;
1285 unsigned int count;
1286 } *vect_peel_info;
1288 typedef struct _vect_peel_extended_info
1290 struct _vect_peel_info peel_info;
1291 unsigned int inside_cost;
1292 unsigned int outside_cost;
1293 } *vect_peel_extended_info;
1296 /* Peeling hashtable helpers. */
1298 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1300 static inline hashval_t hash (const _vect_peel_info *);
1301 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1304 inline hashval_t
1305 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1307 return (hashval_t) peel_info->npeel;
1310 inline bool
1311 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1313 return (a->npeel == b->npeel);
1317 /* Insert DR into peeling hash table with NPEEL as key. */
1319 static void
1320 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1321 loop_vec_info loop_vinfo, struct data_reference *dr,
1322 int npeel)
1324 struct _vect_peel_info elem, *slot;
1325 _vect_peel_info **new_slot;
1326 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1328 elem.npeel = npeel;
1329 slot = peeling_htab->find (&elem);
1330 if (slot)
1331 slot->count++;
1332 else
1334 slot = XNEW (struct _vect_peel_info);
1335 slot->npeel = npeel;
1336 slot->dr = dr;
1337 slot->count = 1;
1338 new_slot = peeling_htab->find_slot (slot, INSERT);
1339 *new_slot = slot;
1342 if (!supportable_dr_alignment
1343 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1344 slot->count += VECT_MAX_COST;
1348 /* Traverse peeling hash table to find peeling option that aligns maximum
1349 number of data accesses. */
1352 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1353 _vect_peel_extended_info *max)
1355 vect_peel_info elem = *slot;
1357 if (elem->count > max->peel_info.count
1358 || (elem->count == max->peel_info.count
1359 && max->peel_info.npeel > elem->npeel))
1361 max->peel_info.npeel = elem->npeel;
1362 max->peel_info.count = elem->count;
1363 max->peel_info.dr = elem->dr;
1366 return 1;
1369 /* Get the costs of peeling NPEEL iterations checking data access costs
1370 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1371 misalignment will be zero after peeling. */
1373 static void
1374 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1375 struct data_reference *dr0,
1376 unsigned int *inside_cost,
1377 unsigned int *outside_cost,
1378 stmt_vector_for_cost *body_cost_vec,
1379 unsigned int npeel,
1380 bool unknown_misalignment)
1382 unsigned i;
1383 data_reference *dr;
1385 FOR_EACH_VEC_ELT (datarefs, i, dr)
1387 gimple *stmt = DR_STMT (dr);
1388 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1389 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1390 continue;
1392 /* For interleaving, only the alignment of the first access
1393 matters. */
1394 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1395 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1396 continue;
1398 /* Strided accesses perform only component accesses, alignment is
1399 irrelevant for them. */
1400 if (STMT_VINFO_STRIDED_P (stmt_info)
1401 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1402 continue;
1404 int save_misalignment;
1405 save_misalignment = DR_MISALIGNMENT (dr);
1406 if (npeel == 0)
1408 else if (unknown_misalignment && dr == dr0)
1409 SET_DR_MISALIGNMENT (dr, 0);
1410 else
1411 vect_update_misalignment_for_peel (dr, dr0, npeel);
1412 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1413 body_cost_vec);
1414 SET_DR_MISALIGNMENT (dr, save_misalignment);
1418 /* Traverse peeling hash table and calculate cost for each peeling option.
1419 Find the one with the lowest cost. */
1422 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1423 _vect_peel_extended_info *min)
1425 vect_peel_info elem = *slot;
1426 int dummy;
1427 unsigned int inside_cost = 0, outside_cost = 0;
1428 gimple *stmt = DR_STMT (elem->dr);
1429 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1430 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1431 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1432 epilogue_cost_vec;
1434 prologue_cost_vec.create (2);
1435 body_cost_vec.create (2);
1436 epilogue_cost_vec.create (2);
1438 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1439 elem->dr, &inside_cost, &outside_cost,
1440 &body_cost_vec, elem->npeel, false);
1442 body_cost_vec.release ();
1444 outside_cost += vect_get_known_peeling_cost
1445 (loop_vinfo, elem->npeel, &dummy,
1446 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1447 &prologue_cost_vec, &epilogue_cost_vec);
1449 /* Prologue and epilogue costs are added to the target model later.
1450 These costs depend only on the scalar iteration cost, the
1451 number of peeling iterations finally chosen, and the number of
1452 misaligned statements. So discard the information found here. */
1453 prologue_cost_vec.release ();
1454 epilogue_cost_vec.release ();
1456 if (inside_cost < min->inside_cost
1457 || (inside_cost == min->inside_cost
1458 && outside_cost < min->outside_cost))
1460 min->inside_cost = inside_cost;
1461 min->outside_cost = outside_cost;
1462 min->peel_info.dr = elem->dr;
1463 min->peel_info.npeel = elem->npeel;
1464 min->peel_info.count = elem->count;
1467 return 1;
1471 /* Choose best peeling option by traversing peeling hash table and either
1472 choosing an option with the lowest cost (if cost model is enabled) or the
1473 option that aligns as many accesses as possible. */
1475 static struct _vect_peel_extended_info
1476 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1477 loop_vec_info loop_vinfo)
1479 struct _vect_peel_extended_info res;
1481 res.peel_info.dr = NULL;
1483 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1485 res.inside_cost = INT_MAX;
1486 res.outside_cost = INT_MAX;
1487 peeling_htab->traverse <_vect_peel_extended_info *,
1488 vect_peeling_hash_get_lowest_cost> (&res);
1490 else
1492 res.peel_info.count = 0;
1493 peeling_htab->traverse <_vect_peel_extended_info *,
1494 vect_peeling_hash_get_most_frequent> (&res);
1495 res.inside_cost = 0;
1496 res.outside_cost = 0;
1499 return res;
1502 /* Return true if the new peeling NPEEL is supported. */
1504 static bool
1505 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1506 unsigned npeel)
1508 unsigned i;
1509 struct data_reference *dr = NULL;
1510 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1511 gimple *stmt;
1512 stmt_vec_info stmt_info;
1513 enum dr_alignment_support supportable_dr_alignment;
1515 /* Ensure that all data refs can be vectorized after the peel. */
1516 FOR_EACH_VEC_ELT (datarefs, i, dr)
1518 int save_misalignment;
1520 if (dr == dr0)
1521 continue;
1523 stmt = DR_STMT (dr);
1524 stmt_info = vinfo_for_stmt (stmt);
1525 /* For interleaving, only the alignment of the first access
1526 matters. */
1527 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1528 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1529 continue;
1531 /* Strided accesses perform only component accesses, alignment is
1532 irrelevant for them. */
1533 if (STMT_VINFO_STRIDED_P (stmt_info)
1534 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1535 continue;
1537 save_misalignment = DR_MISALIGNMENT (dr);
1538 vect_update_misalignment_for_peel (dr, dr0, npeel);
1539 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1540 SET_DR_MISALIGNMENT (dr, save_misalignment);
1542 if (!supportable_dr_alignment)
1543 return false;
1546 return true;
1549 /* Function vect_enhance_data_refs_alignment
1551 This pass will use loop versioning and loop peeling in order to enhance
1552 the alignment of data references in the loop.
1554 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1555 original loop is to be vectorized. Any other loops that are created by
1556 the transformations performed in this pass - are not supposed to be
1557 vectorized. This restriction will be relaxed.
1559 This pass will require a cost model to guide it whether to apply peeling
1560 or versioning or a combination of the two. For example, the scheme that
1561 Intel uses when given a loop with several memory accesses, is as follows:
1562 choose one memory access ('p') whose alignment you want to force by doing
1563 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1564 other accesses are not necessarily aligned, or (2) use loop versioning to
1565 generate one loop in which all accesses are aligned, and another loop in
1566 which only 'p' is necessarily aligned.
1568 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1569 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1570 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1572 Devising a cost model is the most critical aspect of this work. It will
1573 guide us on which access to peel for, whether to use loop versioning, how
1574 many versions to create, etc. The cost model will probably consist of
1575 generic considerations as well as target specific considerations (on
1576 powerpc for example, misaligned stores are more painful than misaligned
1577 loads).
1579 Here are the general steps involved in alignment enhancements:
1581 -- original loop, before alignment analysis:
1582 for (i=0; i<N; i++){
1583 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1584 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1587 -- After vect_compute_data_refs_alignment:
1588 for (i=0; i<N; i++){
1589 x = q[i]; # DR_MISALIGNMENT(q) = 3
1590 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1593 -- Possibility 1: we do loop versioning:
1594 if (p is aligned) {
1595 for (i=0; i<N; i++){ # loop 1A
1596 x = q[i]; # DR_MISALIGNMENT(q) = 3
1597 p[i] = y; # DR_MISALIGNMENT(p) = 0
1600 else {
1601 for (i=0; i<N; i++){ # loop 1B
1602 x = q[i]; # DR_MISALIGNMENT(q) = 3
1603 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1607 -- Possibility 2: we do loop peeling:
1608 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1609 x = q[i];
1610 p[i] = y;
1612 for (i = 3; i < N; i++){ # loop 2A
1613 x = q[i]; # DR_MISALIGNMENT(q) = 0
1614 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1617 -- Possibility 3: combination of loop peeling and versioning:
1618 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1619 x = q[i];
1620 p[i] = y;
1622 if (p is aligned) {
1623 for (i = 3; i<N; i++){ # loop 3A
1624 x = q[i]; # DR_MISALIGNMENT(q) = 0
1625 p[i] = y; # DR_MISALIGNMENT(p) = 0
1628 else {
1629 for (i = 3; i<N; i++){ # loop 3B
1630 x = q[i]; # DR_MISALIGNMENT(q) = 0
1631 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1635 These loops are later passed to loop_transform to be vectorized. The
1636 vectorizer will use the alignment information to guide the transformation
1637 (whether to generate regular loads/stores, or with special handling for
1638 misalignment). */
1640 bool
1641 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1643 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1644 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1645 enum dr_alignment_support supportable_dr_alignment;
1646 struct data_reference *dr0 = NULL, *first_store = NULL;
1647 struct data_reference *dr;
1648 unsigned int i, j;
1649 bool do_peeling = false;
1650 bool do_versioning = false;
1651 bool stat;
1652 gimple *stmt;
1653 stmt_vec_info stmt_info;
1654 unsigned int npeel = 0;
1655 bool one_misalignment_known = false;
1656 bool one_misalignment_unknown = false;
1657 bool one_dr_unsupportable = false;
1658 struct data_reference *unsupportable_dr = NULL;
1659 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1660 unsigned possible_npeel_number = 1;
1661 tree vectype;
1662 unsigned int mis, same_align_drs_max = 0;
1663 hash_table<peel_info_hasher> peeling_htab (1);
1665 if (dump_enabled_p ())
1666 dump_printf_loc (MSG_NOTE, vect_location,
1667 "=== vect_enhance_data_refs_alignment ===\n");
1669 /* Reset data so we can safely be called multiple times. */
1670 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1671 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1673 /* While cost model enhancements are expected in the future, the high level
1674 view of the code at this time is as follows:
1676 A) If there is a misaligned access then see if peeling to align
1677 this access can make all data references satisfy
1678 vect_supportable_dr_alignment. If so, update data structures
1679 as needed and return true.
1681 B) If peeling wasn't possible and there is a data reference with an
1682 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1683 then see if loop versioning checks can be used to make all data
1684 references satisfy vect_supportable_dr_alignment. If so, update
1685 data structures as needed and return true.
1687 C) If neither peeling nor versioning were successful then return false if
1688 any data reference does not satisfy vect_supportable_dr_alignment.
1690 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1692 Note, Possibility 3 above (which is peeling and versioning together) is not
1693 being done at this time. */
1695 /* (1) Peeling to force alignment. */
1697 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1698 Considerations:
1699 + How many accesses will become aligned due to the peeling
1700 - How many accesses will become unaligned due to the peeling,
1701 and the cost of misaligned accesses.
1702 - The cost of peeling (the extra runtime checks, the increase
1703 in code size). */
1705 FOR_EACH_VEC_ELT (datarefs, i, dr)
1707 stmt = DR_STMT (dr);
1708 stmt_info = vinfo_for_stmt (stmt);
1710 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1711 continue;
1713 /* For interleaving, only the alignment of the first access
1714 matters. */
1715 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1716 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1717 continue;
1719 /* For invariant accesses there is nothing to enhance. */
1720 if (integer_zerop (DR_STEP (dr)))
1721 continue;
1723 /* Strided accesses perform only component accesses, alignment is
1724 irrelevant for them. */
1725 if (STMT_VINFO_STRIDED_P (stmt_info)
1726 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1727 continue;
1729 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1730 do_peeling = vector_alignment_reachable_p (dr);
1731 if (do_peeling)
1733 if (known_alignment_for_access_p (dr))
1735 unsigned int npeel_tmp = 0;
1736 bool negative = tree_int_cst_compare (DR_STEP (dr),
1737 size_zero_node) < 0;
1739 vectype = STMT_VINFO_VECTYPE (stmt_info);
1740 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1741 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1742 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1743 if (DR_MISALIGNMENT (dr) != 0)
1744 npeel_tmp = (mis & (target_align - 1)) / dr_size;
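/* Worked example (assumed values, for illustration): with a target
   alignment of 16 bytes, 4-byte elements and DR_MISALIGNMENT 4 for a
   positive-step access, MIS is -4, (-4 & 15) is 12 and NPEEL_TMP is
   12 / 4 = 3; peeling three scalar iterations moves the access from
   offset 4 to offset 16, i.e. to an aligned address.  */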
1746 /* For multiple types, it is possible that the bigger type access
1747 will have more than one peeling option. E.g., a loop with two
1748 types: one of size (vector size / 4), and the other one of
1749 size (vector size / 8). Vectorization factor will be 8. If both
1750 accesses are misaligned by 3, the first one needs one scalar
1751 iteration to be aligned, and the second one needs 5. But the
1752 first one will be aligned also by peeling 5 scalar
1753 iterations, and in that case both accesses will be aligned.
1754 Hence, except for the immediate peeling amount, we also want
1755 to try adding the full vector size, as long as we do not exceed the
1756 vectorization factor.
1757 We do this automatically for the cost model, since we calculate
1758 the cost for every peeling option. */
1759 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1761 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1762 ? vf * GROUP_SIZE (stmt_info) : vf);
1763 possible_npeel_number
1764 = vect_get_num_vectors (nscalars, vectype);
1766 /* NPEEL_TMP is 0 when there is no misalignment, but also
1767 allow peeling NELEMENTS. */
1768 if (DR_MISALIGNMENT (dr) == 0)
1769 possible_npeel_number++;
1772 /* Save info about DR in the hash table. Also include peeling
1773 amounts according to the explanation above. */
1774 for (j = 0; j < possible_npeel_number; j++)
1776 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1777 dr, npeel_tmp);
1778 npeel_tmp += target_align / dr_size;
1781 one_misalignment_known = true;
1783 else
1785 /* If we don't know any misalignment values, we prefer
1786 peeling for the data-ref that has the maximum number of data-refs
1787 with the same alignment, unless the target prefers to align
1788 stores over loads. */
1789 unsigned same_align_drs
1790 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1791 if (!dr0
1792 || same_align_drs_max < same_align_drs)
1794 same_align_drs_max = same_align_drs;
1795 dr0 = dr;
1797 /* For data-refs with the same number of related
1798 accesses prefer the one where the misalign
1799 computation will be invariant in the outermost loop. */
1800 else if (same_align_drs_max == same_align_drs)
1802 struct loop *ivloop0, *ivloop;
1803 ivloop0 = outermost_invariant_loop_for_expr
1804 (loop, DR_BASE_ADDRESS (dr0));
1805 ivloop = outermost_invariant_loop_for_expr
1806 (loop, DR_BASE_ADDRESS (dr));
1807 if ((ivloop && !ivloop0)
1808 || (ivloop && ivloop0
1809 && flow_loop_nested_p (ivloop, ivloop0)))
1810 dr0 = dr;
1813 one_misalignment_unknown = true;
1815 /* Check for data refs with unsupportable alignment that
1816 can be peeled. */
1817 if (!supportable_dr_alignment)
1819 one_dr_unsupportable = true;
1820 unsupportable_dr = dr;
1823 if (!first_store && DR_IS_WRITE (dr))
1824 first_store = dr;
1827 else
1829 if (!aligned_access_p (dr))
1831 if (dump_enabled_p ())
1832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1833 "vector alignment may not be reachable\n");
1834 break;
1839 /* Check if we can possibly peel the loop. */
1840 if (!vect_can_advance_ivs_p (loop_vinfo)
1841 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1842 || loop->inner)
1843 do_peeling = false;
1845 struct _vect_peel_extended_info peel_for_known_alignment;
1846 struct _vect_peel_extended_info peel_for_unknown_alignment;
1847 struct _vect_peel_extended_info best_peel;
1849 peel_for_unknown_alignment.inside_cost = INT_MAX;
1850 peel_for_unknown_alignment.outside_cost = INT_MAX;
1851 peel_for_unknown_alignment.peel_info.count = 0;
1853 if (do_peeling
1854 && one_misalignment_unknown)
1856 /* Check whether the target prefers to align stores over loads, i.e., whether
1857 misaligned stores are more expensive than misaligned loads (taking
1858 drs with same alignment into account). */
1859 unsigned int load_inside_cost = 0;
1860 unsigned int load_outside_cost = 0;
1861 unsigned int store_inside_cost = 0;
1862 unsigned int store_outside_cost = 0;
1863 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1865 stmt_vector_for_cost dummy;
1866 dummy.create (2);
1867 vect_get_peeling_costs_all_drs (datarefs, dr0,
1868 &load_inside_cost,
1869 &load_outside_cost,
1870 &dummy, estimated_npeels, true);
1871 dummy.release ();
1873 if (first_store)
1875 dummy.create (2);
1876 vect_get_peeling_costs_all_drs (datarefs, first_store,
1877 &store_inside_cost,
1878 &store_outside_cost,
1879 &dummy, estimated_npeels, true);
1880 dummy.release ();
1882 else
1884 store_inside_cost = INT_MAX;
1885 store_outside_cost = INT_MAX;
1888 if (load_inside_cost > store_inside_cost
1889 || (load_inside_cost == store_inside_cost
1890 && load_outside_cost > store_outside_cost))
1892 dr0 = first_store;
1893 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1894 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1896 else
1898 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1899 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1902 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1903 prologue_cost_vec.create (2);
1904 epilogue_cost_vec.create (2);
1906 int dummy2;
1907 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1908 (loop_vinfo, estimated_npeels, &dummy2,
1909 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1910 &prologue_cost_vec, &epilogue_cost_vec);
1912 prologue_cost_vec.release ();
1913 epilogue_cost_vec.release ();
1915 peel_for_unknown_alignment.peel_info.count = 1
1916 + STMT_VINFO_SAME_ALIGN_REFS
1917 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1920 peel_for_unknown_alignment.peel_info.npeel = 0;
1921 peel_for_unknown_alignment.peel_info.dr = dr0;
1923 best_peel = peel_for_unknown_alignment;
1925 peel_for_known_alignment.inside_cost = INT_MAX;
1926 peel_for_known_alignment.outside_cost = INT_MAX;
1927 peel_for_known_alignment.peel_info.count = 0;
1928 peel_for_known_alignment.peel_info.dr = NULL;
1930 if (do_peeling && one_misalignment_known)
1932 /* Peeling is possible, but no data access strictly requires alignment
1933 in order to be supported. So we try to choose the best possible peeling
1934 from the hash table. */
1935 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1936 (&peeling_htab, loop_vinfo);
1939 /* Compare costs of peeling for known and unknown alignment. */
1940 if (peel_for_known_alignment.peel_info.dr != NULL
1941 && peel_for_unknown_alignment.inside_cost
1942 >= peel_for_known_alignment.inside_cost)
1944 best_peel = peel_for_known_alignment;
1946 /* If the best peeling for known alignment has NPEEL == 0, perform no
1947 peeling at all except if there is an unsupportable dr that we can
1948 align. */
1949 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1950 do_peeling = false;
1953 /* If there is an unsupportable data ref, prefer this over all choices so far
1954 since we'd have to discard a chosen peeling except when it accidentally
1955 aligned the unsupportable data ref. */
1956 if (one_dr_unsupportable)
1957 dr0 = unsupportable_dr;
1958 else if (do_peeling)
1960 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1961 TODO: Use nopeel_outside_cost or get rid of it? */
1962 unsigned nopeel_inside_cost = 0;
1963 unsigned nopeel_outside_cost = 0;
1965 stmt_vector_for_cost dummy;
1966 dummy.create (2);
1967 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1968 &nopeel_outside_cost, &dummy, 0, false);
1969 dummy.release ();
1971 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1972 costs will be recorded. */
1973 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1974 prologue_cost_vec.create (2);
1975 epilogue_cost_vec.create (2);
1977 int dummy2;
1978 nopeel_outside_cost += vect_get_known_peeling_cost
1979 (loop_vinfo, 0, &dummy2,
1980 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1981 &prologue_cost_vec, &epilogue_cost_vec);
1983 prologue_cost_vec.release ();
1984 epilogue_cost_vec.release ();
1986 npeel = best_peel.peel_info.npeel;
1987 dr0 = best_peel.peel_info.dr;
1989 /* If not peeling at all is no more expensive than the best peeling
1990 found so far, don't perform any peeling. */
1991 if (nopeel_inside_cost <= best_peel.inside_cost)
1992 do_peeling = false;
1995 if (do_peeling)
1997 stmt = DR_STMT (dr0);
1998 stmt_info = vinfo_for_stmt (stmt);
1999 vectype = STMT_VINFO_VECTYPE (stmt_info);
2001 if (known_alignment_for_access_p (dr0))
2003 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2004 size_zero_node) < 0;
2005 if (!npeel)
2007 /* Since it's known at compile time, compute the number of
2008 iterations in the peeled loop (the peeling factor) for use in
2009 updating DR_MISALIGNMENT values. The peeling factor is the
2010 vectorization factor minus the misalignment as an element
2011 count. */
2012 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2013 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2014 npeel = ((mis & (target_align - 1))
2015 / vect_get_scalar_dr_size (dr0));
2018 /* For interleaved data access every iteration accesses all the
2019 members of the group, therefore we divide the number of iterations
2020 by the group size. */
2021 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2022 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2023 npeel /= GROUP_SIZE (stmt_info);
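/* For illustration (assumed values): if the misalignment corresponds
   to six scalar elements and the access belongs to an interleaved
   group of size 2, every loop iteration covers two elements, so only
   6 / 2 = 3 loop iterations need to be peeled.  */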
2025 if (dump_enabled_p ())
2026 dump_printf_loc (MSG_NOTE, vect_location,
2027 "Try peeling by %d\n", npeel);
2030 /* Ensure that all datarefs can be vectorized after the peel. */
2031 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2032 do_peeling = false;
2034 /* Check if all datarefs are supportable and log. */
2035 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2037 stat = vect_verify_datarefs_alignment (loop_vinfo);
2038 if (!stat)
2039 do_peeling = false;
2040 else
2041 return stat;
2044 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2045 if (do_peeling)
2047 unsigned max_allowed_peel
2048 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2049 if (max_allowed_peel != (unsigned)-1)
2051 unsigned max_peel = npeel;
2052 if (max_peel == 0)
2054 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2055 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
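/* E.g. (illustrative values): with a 16-byte target alignment and
   4-byte scalar accesses, an unknown misalignment needs at most
   16 / 4 - 1 = 3 peeled iterations, so MAX_PEEL is 3 here.  */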
2057 if (max_peel > max_allowed_peel)
2059 do_peeling = false;
2060 if (dump_enabled_p ())
2061 dump_printf_loc (MSG_NOTE, vect_location,
2062 "Disable peeling, max peels reached: %d\n", max_peel);
2067 /* Cost model #2 - if peeling may result in a remaining loop not
2068 iterating enough to be vectorized then do not peel. Since this
2069 is a cost heuristic rather than a correctness decision, use the
2070 most likely runtime value for variable vectorization factors. */
2071 if (do_peeling
2072 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2074 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2075 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2076 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2077 < assumed_vf + max_peel)
2078 do_peeling = false;
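/* Illustrative numbers (assumed): with an assumed VF of 4 and an
   unknown peel amount, MAX_PEEL is 3, so a loop known to execute
   fewer than 4 + 3 = 7 iterations is not peeled, since the remaining
   loop might not reach a single full vector iteration.  */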
2081 if (do_peeling)
2083 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2084 If the misalignment of DR_i is identical to that of dr0 then set
2085 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2086 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2087 by the peeling factor times the element size of DR_i (MOD the
2088 vectorization factor times the size). Otherwise, the
2089 misalignment of DR_i must be set to unknown. */
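/* A hypothetical example of the update described above, with VF 4 and
   4-byte elements (so the modulus is 16): if DR0 is misaligned by 4
   bytes and NPEEL is 3, a DR_i that is also misaligned by 4 bytes
   becomes aligned, whereas a DR_i misaligned by 8 bytes ends up
   misaligned by (8 + 3 * 4) mod 16 = 4 bytes.  */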
2090 FOR_EACH_VEC_ELT (datarefs, i, dr)
2091 if (dr != dr0)
2093 /* Strided accesses perform only component accesses, alignment
2094 is irrelevant for them. */
2095 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2096 if (STMT_VINFO_STRIDED_P (stmt_info)
2097 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2098 continue;
2100 vect_update_misalignment_for_peel (dr, dr0, npeel);
2103 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2104 if (npeel)
2105 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2106 else
2107 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2108 = DR_MISALIGNMENT (dr0);
2109 SET_DR_MISALIGNMENT (dr0, 0);
2110 if (dump_enabled_p ())
2112 dump_printf_loc (MSG_NOTE, vect_location,
2113 "Alignment of access forced using peeling.\n");
2114 dump_printf_loc (MSG_NOTE, vect_location,
2115 "Peeling for alignment will be applied.\n");
2118 /* The inside-loop cost will be accounted for in vectorizable_load
2119 and vectorizable_store correctly with adjusted alignments.
2120 Drop the body_cst_vec on the floor here. */
2121 stat = vect_verify_datarefs_alignment (loop_vinfo);
2122 gcc_assert (stat);
2123 return stat;
2127 /* (2) Versioning to force alignment. */
2129 /* Try versioning if:
2130 1) optimize loop for speed
2131 2) there is at least one unsupported misaligned data ref with an unknown
2132 misalignment, and
2133 3) all misaligned data refs with a known misalignment are supported, and
2134 4) the number of runtime alignment checks is within reason. */
2136 do_versioning =
2137 optimize_loop_nest_for_speed_p (loop)
2138 && (!loop->inner); /* FORNOW */
2140 if (do_versioning)
2142 FOR_EACH_VEC_ELT (datarefs, i, dr)
2144 stmt = DR_STMT (dr);
2145 stmt_info = vinfo_for_stmt (stmt);
2147 /* For interleaving, only the alignment of the first access
2148 matters. */
2149 if (aligned_access_p (dr)
2150 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2151 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2152 continue;
2154 if (STMT_VINFO_STRIDED_P (stmt_info))
2156 /* Strided loads perform only component accesses, alignment is
2157 irrelevant for them. */
2158 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2159 continue;
2160 do_versioning = false;
2161 break;
2164 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2166 if (!supportable_dr_alignment)
2168 gimple *stmt;
2169 int mask;
2170 tree vectype;
2172 if (known_alignment_for_access_p (dr)
2173 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2174 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2176 do_versioning = false;
2177 break;
2180 stmt = DR_STMT (dr);
2181 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2182 gcc_assert (vectype);
2184 /* At present we don't support versioning for alignment
2185 with variable VF, since there's no guarantee that the
2186 VF is a power of two. We could relax this if we added
2187 a way of enforcing a power-of-two size. */
2188 unsigned HOST_WIDE_INT size;
2189 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2191 do_versioning = false;
2192 break;
2195 /* The rightmost bits of an aligned address must be zeros.
2196 Construct the mask needed for this test. For example,
2197 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2198 mask must be 15 = 0xf. */
2199 mask = size - 1;
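/* Sketch of how the mask is used (the actual gimple is built later,
   elsewhere, and may differ in detail): the versioning condition
   effectively requires (address & mask) == 0 for every recorded
   may-misalign reference, typically by OR-ing the addresses together
   before applying the mask.  */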
2201 /* FORNOW: use the same mask to test all potentially unaligned
2202 references in the loop. The vectorizer currently supports
2203 a single vector size, see the reference to
2204 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2205 vectorization factor is computed. */
2206 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2207 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2208 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2209 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2210 DR_STMT (dr));
2214 /* Versioning requires at least one misaligned data reference. */
2215 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2216 do_versioning = false;
2217 else if (!do_versioning)
2218 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2221 if (do_versioning)
2223 vec<gimple *> may_misalign_stmts
2224 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2225 gimple *stmt;
2227 /* It can now be assumed that the data references in the statements
2228 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2229 of the loop being vectorized. */
2230 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2232 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2233 dr = STMT_VINFO_DATA_REF (stmt_info);
2234 SET_DR_MISALIGNMENT (dr, 0);
2235 if (dump_enabled_p ())
2236 dump_printf_loc (MSG_NOTE, vect_location,
2237 "Alignment of access forced using versioning.\n");
2240 if (dump_enabled_p ())
2241 dump_printf_loc (MSG_NOTE, vect_location,
2242 "Versioning for alignment will be applied.\n");
2244 /* Peeling and versioning can't be done together at this time. */
2245 gcc_assert (! (do_peeling && do_versioning));
2247 stat = vect_verify_datarefs_alignment (loop_vinfo);
2248 gcc_assert (stat);
2249 return stat;
2252 /* This point is reached if neither peeling nor versioning is being done. */
2253 gcc_assert (! (do_peeling || do_versioning));
2255 stat = vect_verify_datarefs_alignment (loop_vinfo);
2256 return stat;
2260 /* Function vect_find_same_alignment_drs.
2262 Update group and alignment relations according to the chosen
2263 vectorization factor. */
2265 static void
2266 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2268 struct data_reference *dra = DDR_A (ddr);
2269 struct data_reference *drb = DDR_B (ddr);
2270 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2271 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2273 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2274 return;
2276 if (dra == drb)
2277 return;
2279 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2280 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2281 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2282 return;
2284 /* Two references with distance zero have the same alignment. */
2285 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2286 - wi::to_poly_offset (DR_INIT (drb)));
2287 if (maybe_ne (diff, 0))
2289 /* Get the wider of the two alignments. */
2290 unsigned int align_a = (vect_calculate_target_alignment (dra)
2291 / BITS_PER_UNIT);
2292 unsigned int align_b = (vect_calculate_target_alignment (drb)
2293 / BITS_PER_UNIT);
2294 unsigned int max_align = MAX (align_a, align_b);
2296 /* Require the gap to be a multiple of the larger vector alignment. */
2297 if (!multiple_p (diff, max_align))
2298 return;
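/* Worked example (assumed values): with a maximum target alignment of
   16 bytes, two references that share base, offset and step and whose
   DR_INITs differ by 32 bytes always have the same misalignment and
   are recorded below; a difference of 8 bytes is rejected by the
   check above.  */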
2301 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2302 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2303 if (dump_enabled_p ())
2305 dump_printf_loc (MSG_NOTE, vect_location,
2306 "accesses have the same alignment: ");
2307 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2308 dump_printf (MSG_NOTE, " and ");
2309 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2310 dump_printf (MSG_NOTE, "\n");
2315 /* Function vect_analyze_data_refs_alignment
2317 Analyze the alignment of the data-references in the loop.
2318 Return FALSE if a data reference is found that cannot be vectorized. */
2320 bool
2321 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2323 if (dump_enabled_p ())
2324 dump_printf_loc (MSG_NOTE, vect_location,
2325 "=== vect_analyze_data_refs_alignment ===\n");
2327 /* Mark groups of data references with same alignment using
2328 data dependence information. */
2329 vec<ddr_p> ddrs = vinfo->ddrs;
2330 struct data_dependence_relation *ddr;
2331 unsigned int i;
2333 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2334 vect_find_same_alignment_drs (ddr);
2336 vec<data_reference_p> datarefs = vinfo->datarefs;
2337 struct data_reference *dr;
2339 vect_record_base_alignments (vinfo);
2340 FOR_EACH_VEC_ELT (datarefs, i, dr)
2342 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2343 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2344 && !vect_compute_data_ref_alignment (dr))
2346 /* Strided accesses perform only component accesses, misalignment
2347 information is irrelevant for them. */
2348 if (STMT_VINFO_STRIDED_P (stmt_info)
2349 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2350 continue;
2352 if (dump_enabled_p ())
2353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2354 "not vectorized: can't calculate alignment "
2355 "for data ref.\n");
2357 return false;
2361 return true;
2365 /* Analyze alignment of DRs of stmts in NODE. */
2367 static bool
2368 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2370 /* We vectorize from the first scalar stmt in the node unless
2371 the node is permuted in which case we start from the first
2372 element in the group. */
2373 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2374 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2375 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2376 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2378 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2379 if (! vect_compute_data_ref_alignment (dr)
2380 /* For creating the data-ref pointer we need alignment of the
2381 first element anyway. */
2382 || (dr != first_dr
2383 && ! vect_compute_data_ref_alignment (first_dr))
2384 || ! verify_data_ref_alignment (dr))
2386 if (dump_enabled_p ())
2387 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2388 "not vectorized: bad data alignment in basic "
2389 "block.\n");
2390 return false;
2393 return true;
2396 /* Function vect_slp_analyze_and_verify_instance_alignment
2398 Analyze the alignment of the data-references in the SLP instance.
2399 Return FALSE if a data reference is found that cannot be vectorized. */
2401 bool
2402 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2404 if (dump_enabled_p ())
2405 dump_printf_loc (MSG_NOTE, vect_location,
2406 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2408 slp_tree node;
2409 unsigned i;
2410 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2411 if (! vect_slp_analyze_and_verify_node_alignment (node))
2412 return false;
2414 node = SLP_INSTANCE_TREE (instance);
2415 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2416 && ! vect_slp_analyze_and_verify_node_alignment
2417 (SLP_INSTANCE_TREE (instance)))
2418 return false;
2420 return true;
2424 /* Analyze groups of accesses: check that DR belongs to a group of
2425 accesses of legal size, step, etc. Detect gaps, single element
2426 interleaving, and other special cases. Set grouped access info.
2427 Collect groups of strided stores for further use in SLP analysis.
2428 Worker for vect_analyze_group_access. */
2430 static bool
2431 vect_analyze_group_access_1 (struct data_reference *dr)
2433 tree step = DR_STEP (dr);
2434 tree scalar_type = TREE_TYPE (DR_REF (dr));
2435 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2436 gimple *stmt = DR_STMT (dr);
2437 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2438 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2439 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2440 HOST_WIDE_INT dr_step = -1;
2441 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2442 bool slp_impossible = false;
2444 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2445 size of the interleaving group (including gaps). */
2446 if (tree_fits_shwi_p (step))
2448 dr_step = tree_to_shwi (step);
2449 /* Check that STEP is a multiple of type size. Otherwise there is
2450 a non-element-sized gap at the end of the group which we
2451 cannot represent in GROUP_GAP or GROUP_SIZE.
2452 ??? As we can handle non-constant step fine here we should
2453 simply remove uses of GROUP_GAP between the last and first
2454 element and instead rely on DR_STEP. GROUP_SIZE then would
2455 simply not include that gap. */
2456 if ((dr_step % type_size) != 0)
2458 if (dump_enabled_p ())
2460 dump_printf_loc (MSG_NOTE, vect_location,
2461 "Step ");
2462 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2463 dump_printf (MSG_NOTE,
2464 " is not a multiple of the element size for ");
2465 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2466 dump_printf (MSG_NOTE, "\n");
2468 return false;
2470 groupsize = absu_hwi (dr_step) / type_size;
2472 else
2473 groupsize = 0;
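/* For example (illustrative values): a 4-byte element accessed with a
   constant step of 16 bytes gives GROUPSIZE = 16 / 4 = 4, whereas a
   step of 18 bytes is rejected above because the trailing 2-byte gap
   cannot be represented.  */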
2475 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2476 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2478 /* Check whether this DR is part of an interleaving group and is the
2479 single element of the group that is accessed in the loop. */
2481 /* Gaps are supported only for loads. STEP must be a multiple of the type
2482 size. */
2483 if (DR_IS_READ (dr)
2484 && (dr_step % type_size) == 0
2485 && groupsize > 0)
2487 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2488 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2489 GROUP_GAP (stmt_info) = groupsize - 1;
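/* A hypothetical example: a load such as ... = a[4 * i] with 4-byte
   elements has a step of 16 bytes, so it is treated as a group of
   size 4 of which only the first element is accessed, leaving a gap
   of 3 elements.  */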
2490 if (dump_enabled_p ())
2492 dump_printf_loc (MSG_NOTE, vect_location,
2493 "Detected single element interleaving ");
2494 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2495 dump_printf (MSG_NOTE, " step ");
2496 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2497 dump_printf (MSG_NOTE, "\n");
2500 return true;
2503 if (dump_enabled_p ())
2505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2506 "not consecutive access ");
2507 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2510 if (bb_vinfo)
2512 /* Mark the statement as unvectorizable. */
2513 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2514 return true;
2517 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2518 STMT_VINFO_STRIDED_P (stmt_info) = true;
2519 return true;
2522 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2524 /* First stmt in the interleaving chain. Check the chain. */
2525 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2526 struct data_reference *data_ref = dr;
2527 unsigned int count = 1;
2528 tree prev_init = DR_INIT (data_ref);
2529 gimple *prev = stmt;
2530 HOST_WIDE_INT diff, gaps = 0;
2532 /* By construction, all group members have INTEGER_CST DR_INITs. */
2533 while (next)
2535 /* Skip same data-refs. If two or more stmts share a
2536 data-ref (supported only for loads), we vectorize only the first
2537 stmt, and the rest get their vectorized loads from the first
2538 one. */
2539 if (!tree_int_cst_compare (DR_INIT (data_ref),
2540 DR_INIT (STMT_VINFO_DATA_REF (
2541 vinfo_for_stmt (next)))))
2543 if (DR_IS_WRITE (data_ref))
2545 if (dump_enabled_p ())
2546 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2547 "Two store stmts share the same dr.\n");
2548 return false;
2551 if (dump_enabled_p ())
2552 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2553 "Two or more load stmts share the same dr.\n");
2555 /* For loads, reuse the data-ref load from the first stmt. */
2556 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2558 prev = next;
2559 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2560 continue;
2563 prev = next;
2564 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2566 /* All group members have the same STEP by construction. */
2567 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2569 /* Check that the distance between two accesses is equal to the type
2570 size. Otherwise, we have gaps. */
2571 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2572 - TREE_INT_CST_LOW (prev_init)) / type_size;
2573 if (diff != 1)
2575 /* FORNOW: SLP of accesses with gaps is not supported. */
2576 slp_impossible = true;
2577 if (DR_IS_WRITE (data_ref))
2579 if (dump_enabled_p ())
2580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2581 "interleaved store with gaps\n");
2582 return false;
2585 gaps += diff - 1;
2588 last_accessed_element += diff;
2590 /* Store the gap from the previous member of the group. If there is no
2591 gap in the access, GROUP_GAP is always 1. */
2592 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2594 prev_init = DR_INIT (data_ref);
2595 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2596 /* Count the number of data-refs in the chain. */
2597 count++;
2600 if (groupsize == 0)
2601 groupsize = count + gaps;
2603 /* This could be UINT_MAX but as we are generating code in a very
2604 inefficient way we have to cap earlier. See PR78699 for example. */
2605 if (groupsize > 4096)
2607 if (dump_enabled_p ())
2608 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2609 "group is too large\n");
2610 return false;
2613 /* Check that the size of the interleaving is equal to count for stores,
2614 i.e., that there are no gaps. */
2615 if (groupsize != count
2616 && !DR_IS_READ (dr))
2618 if (dump_enabled_p ())
2619 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2620 "interleaved store with gaps\n");
2621 return false;
2624 /* If there is a gap after the last load in the group it is the
2625 difference between the groupsize and the last accessed
2626 element.
2627 When there is no gap, this difference should be 0. */
2628 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
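/* Illustrative numbers: for a load group with GROUPSIZE 4 whose
   members sit at element offsets 0, 1 and 2, LAST_ACCESSED_ELEMENT is
   3, so the gap recorded after the group is 4 - 3 = 1 element.  */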
2630 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2631 if (dump_enabled_p ())
2633 dump_printf_loc (MSG_NOTE, vect_location,
2634 "Detected interleaving ");
2635 if (DR_IS_READ (dr))
2636 dump_printf (MSG_NOTE, "load ");
2637 else
2638 dump_printf (MSG_NOTE, "store ");
2639 dump_printf (MSG_NOTE, "of size %u starting with ",
2640 (unsigned)groupsize);
2641 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2642 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2643 dump_printf_loc (MSG_NOTE, vect_location,
2644 "There is a gap of %u elements after the group\n",
2645 GROUP_GAP (vinfo_for_stmt (stmt)));
2648 /* SLP: create an SLP data structure for every interleaving group of
2649 stores for further analysis in vect_analyze_slp. */
2650 if (DR_IS_WRITE (dr) && !slp_impossible)
2652 if (loop_vinfo)
2653 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2654 if (bb_vinfo)
2655 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2659 return true;
2662 /* Analyze groups of accesses: check that DR belongs to a group of
2663 accesses of legal size, step, etc. Detect gaps, single element
2664 interleaving, and other special cases. Set grouped access info.
2665 Collect groups of strided stores for further use in SLP analysis. */
2667 static bool
2668 vect_analyze_group_access (struct data_reference *dr)
2670 if (!vect_analyze_group_access_1 (dr))
2672 /* Dissolve the group if present. */
2673 gimple *next;
2674 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2675 while (stmt)
2677 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2678 next = GROUP_NEXT_ELEMENT (vinfo);
2679 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2680 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2681 stmt = next;
2683 return false;
2685 return true;
2688 /* Analyze the access pattern of the data-reference DR.
2689 In case of non-consecutive accesses call vect_analyze_group_access() to
2690 analyze groups of accesses. */
2692 static bool
2693 vect_analyze_data_ref_access (struct data_reference *dr)
2695 tree step = DR_STEP (dr);
2696 tree scalar_type = TREE_TYPE (DR_REF (dr));
2697 gimple *stmt = DR_STMT (dr);
2698 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2699 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2700 struct loop *loop = NULL;
2702 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2703 return true;
2705 if (loop_vinfo)
2706 loop = LOOP_VINFO_LOOP (loop_vinfo);
2708 if (loop_vinfo && !step)
2710 if (dump_enabled_p ())
2711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2712 "bad data-ref access in loop\n");
2713 return false;
2716 /* Allow loads with zero step in inner-loop vectorization. */
2717 if (loop_vinfo && integer_zerop (step))
2719 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2720 if (!nested_in_vect_loop_p (loop, stmt))
2721 return DR_IS_READ (dr);
2722 /* Allow references with zero step for outer loops marked
2723 with pragma omp simd only - it guarantees absence of
2724 loop-carried dependencies between inner loop iterations. */
2725 if (loop->safelen < 2)
2727 if (dump_enabled_p ())
2728 dump_printf_loc (MSG_NOTE, vect_location,
2729 "zero step in inner loop of nest\n");
2730 return false;
2734 if (loop && nested_in_vect_loop_p (loop, stmt))
2736 /* Interleaved accesses are not yet supported within outer-loop
2737 vectorization for references in the inner-loop. */
2738 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2740 /* For the rest of the analysis we use the outer-loop step. */
2741 step = STMT_VINFO_DR_STEP (stmt_info);
2742 if (integer_zerop (step))
2744 if (dump_enabled_p ())
2745 dump_printf_loc (MSG_NOTE, vect_location,
2746 "zero step in outer loop.\n");
2747 return DR_IS_READ (dr);
2751 /* Consecutive? */
2752 if (TREE_CODE (step) == INTEGER_CST)
2754 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2755 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2756 || (dr_step < 0
2757 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2759 /* Mark that it is not interleaving. */
2760 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2761 return true;
2765 if (loop && nested_in_vect_loop_p (loop, stmt))
2767 if (dump_enabled_p ())
2768 dump_printf_loc (MSG_NOTE, vect_location,
2769 "grouped access in outer loop.\n");
2770 return false;
2774 /* Assume this is a DR handled by non-constant strided load case. */
2775 if (TREE_CODE (step) != INTEGER_CST)
2776 return (STMT_VINFO_STRIDED_P (stmt_info)
2777 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2778 || vect_analyze_group_access (dr)));
2780 /* Not consecutive access - check if it's part of an interleaving group. */
2781 return vect_analyze_group_access (dr);
2784 /* Comparison function used to sort data-references DRA and DRB into
2785 chunks suitable for grouping. */
2787 static int
2788 dr_group_sort_cmp (const void *dra_, const void *drb_)
2790 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2791 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2792 int cmp;
2794 /* Stabilize sort. */
2795 if (dra == drb)
2796 return 0;
2798 /* DRs in different loops never belong to the same group. */
2799 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2800 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2801 if (loopa != loopb)
2802 return loopa->num < loopb->num ? -1 : 1;
2804 /* Ordering of DRs according to base. */
2805 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2806 DR_BASE_ADDRESS (drb));
2807 if (cmp != 0)
2808 return cmp;
2810 /* And according to DR_OFFSET. */
2811 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2812 if (cmp != 0)
2813 return cmp;
2815 /* Put reads before writes. */
2816 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2817 return DR_IS_READ (dra) ? -1 : 1;
2820 /* Then sort by access size. */
2820 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2821 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2822 if (cmp != 0)
2823 return cmp;
2826 /* And by step. */
2826 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2827 if (cmp != 0)
2828 return cmp;
2831 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2831 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2832 if (cmp == 0)
2833 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2834 return cmp;
2837 /* If OP is the result of a conversion, return the unconverted value,
2838 otherwise return null. */
2840 static tree
2841 strip_conversion (tree op)
2843 if (TREE_CODE (op) != SSA_NAME)
2844 return NULL_TREE;
2845 gimple *stmt = SSA_NAME_DEF_STMT (op);
2846 if (!is_gimple_assign (stmt)
2847 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2848 return NULL_TREE;
2849 return gimple_assign_rhs1 (stmt);
2852 /* Return true if vectorizable_* routines can handle statements STMT1
2853 and STMT2 being in a single group. */
2855 static bool
2856 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2858 if (gimple_assign_single_p (stmt1))
2859 return gimple_assign_single_p (stmt2);
2861 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2863 /* Check for two masked loads or two masked stores. */
2864 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2865 return false;
2866 internal_fn ifn = gimple_call_internal_fn (stmt1);
2867 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2868 return false;
2869 if (ifn != gimple_call_internal_fn (stmt2))
2870 return false;
2872 /* Check that the masks are the same. Cope with casts of masks,
2873 like those created by build_mask_conversion. */
2874 tree mask1 = gimple_call_arg (stmt1, 2);
2875 tree mask2 = gimple_call_arg (stmt2, 2);
2876 if (!operand_equal_p (mask1, mask2, 0))
2878 mask1 = strip_conversion (mask1);
2879 if (!mask1)
2880 return false;
2881 mask2 = strip_conversion (mask2);
2882 if (!mask2)
2883 return false;
2884 if (!operand_equal_p (mask1, mask2, 0))
2885 return false;
2887 return true;
2890 return false;
2893 /* Function vect_analyze_data_ref_accesses.
2895 Analyze the access pattern of all the data references in the loop.
2897 FORNOW: the only access pattern that is considered vectorizable is a
2898 simple step 1 (consecutive) access.
2900 FORNOW: handle only arrays and pointer accesses. */
2902 bool
2903 vect_analyze_data_ref_accesses (vec_info *vinfo)
2905 unsigned int i;
2906 vec<data_reference_p> datarefs = vinfo->datarefs;
2907 struct data_reference *dr;
2909 if (dump_enabled_p ())
2910 dump_printf_loc (MSG_NOTE, vect_location,
2911 "=== vect_analyze_data_ref_accesses ===\n");
2913 if (datarefs.is_empty ())
2914 return true;
2916 /* Sort the array of datarefs to make building the interleaving chains
2917 linear. Don't modify the original vector's order; it is needed for
2918 determining what dependencies are reversed. */
2919 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2920 datarefs_copy.qsort (dr_group_sort_cmp);
2922 /* Build the interleaving chains. */
2923 for (i = 0; i < datarefs_copy.length () - 1;)
2925 data_reference_p dra = datarefs_copy[i];
2926 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2927 stmt_vec_info lastinfo = NULL;
2928 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2929 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2931 ++i;
2932 continue;
2934 for (i = i + 1; i < datarefs_copy.length (); ++i)
2936 data_reference_p drb = datarefs_copy[i];
2937 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2938 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2939 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2940 break;
2942 /* ??? Imperfect sorting (non-compatible types, non-modulo
2943 accesses, same accesses) can lead to a group being artificially
2944 split here as we don't just skip over those. If it really
2945 matters we can push those to a worklist and re-iterate
2946 over them. Then we can just skip ahead to the next DR here. */
2948 /* DRs in a different loop should not be put into the same
2949 interleaving group. */
2950 if (gimple_bb (DR_STMT (dra))->loop_father
2951 != gimple_bb (DR_STMT (drb))->loop_father)
2952 break;
2954 /* Check that the data-refs have same first location (except init)
2955 and they are both either store or load (not load and store,
2956 not masked loads or stores). */
2957 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2958 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2959 DR_BASE_ADDRESS (drb)) != 0
2960 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2961 || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
2962 break;
2964 /* Check that the data-refs have the same constant size. */
2965 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2966 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2967 if (!tree_fits_uhwi_p (sza)
2968 || !tree_fits_uhwi_p (szb)
2969 || !tree_int_cst_equal (sza, szb))
2970 break;
2972 /* Check that the data-refs have the same step. */
2973 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2974 break;
2976 /* Check the types are compatible.
2977 ??? We don't distinguish this during sorting. */
2978 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2979 TREE_TYPE (DR_REF (drb))))
2980 break;
2982 /* Check that the DR_INITs are compile-time constants. */
2983 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2984 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2985 break;
2987 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2988 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2989 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2990 HOST_WIDE_INT init_prev
2991 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2992 gcc_assert (init_a <= init_b
2993 && init_a <= init_prev
2994 && init_prev <= init_b);
2996 /* Do not place the same access in the interleaving chain twice. */
2997 if (init_b == init_prev)
2999 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3000 < gimple_uid (DR_STMT (drb)));
3001 /* ??? For now we simply "drop" the later reference which is
3002 otherwise the same rather than finishing off this group.
3003 In the end we'd want to re-process duplicates forming
3004 multiple groups from the refs, likely by just collecting
3005 all candidates (including duplicates and split points
3006 below) in a vector and then processing them together. */
3007 continue;
3010 /* If init_b == init_a + the size of the type * k, we have an
3011 interleaving, and DRA is accessed before DRB. */
3012 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3013 if (type_size_a == 0
3014 || (init_b - init_a) % type_size_a != 0)
3015 break;
3017 /* If we have a store, the accesses are adjacent. This splits
3018 groups into chunks we support (we don't support vectorization
3019 of stores with gaps). */
3020 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3021 break;
3023 /* If the step (if not zero or non-constant) is greater than the
3024 difference between data-refs' inits this splits groups into
3025 suitable sizes. */
3026 if (tree_fits_shwi_p (DR_STEP (dra)))
3028 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3029 if (step != 0 && step <= (init_b - init_a))
3030 break;
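/* Two illustrative splits (assumed values): 4-byte stores at inits 0,
   4 and 12 are cut before the third store because 12 - 4 != 4; and
   with an 8-byte step, an access whose init is 8 or more bytes beyond
   the first one belongs to a later iteration and starts a new
   group.  */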
3033 if (dump_enabled_p ())
3035 dump_printf_loc (MSG_NOTE, vect_location,
3036 "Detected interleaving ");
3037 if (DR_IS_READ (dra))
3038 dump_printf (MSG_NOTE, "load ");
3039 else
3040 dump_printf (MSG_NOTE, "store ");
3041 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3042 dump_printf (MSG_NOTE, " and ");
3043 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3044 dump_printf (MSG_NOTE, "\n");
3047 /* Link the found element into the group list. */
3048 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
3050 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
3051 lastinfo = stmtinfo_a;
3053 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
3054 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
3055 lastinfo = stmtinfo_b;
3059 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3060 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
3061 && !vect_analyze_data_ref_access (dr))
3063 if (dump_enabled_p ())
3064 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3065 "not vectorized: complicated access pattern.\n");
3067 if (is_a <bb_vec_info> (vinfo))
3069 /* Mark the statement as not vectorizable. */
3070 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3071 continue;
3073 else
3075 datarefs_copy.release ();
3076 return false;
3080 datarefs_copy.release ();
3081 return true;
3084 /* Function vect_vfa_segment_size.
3086 Input:
3087 DR: The data reference.
3088 LENGTH_FACTOR: segment length to consider.
3090 Return a value suitable for the dr_with_seg_len::seg_len field.
3091 This is the "distance travelled" by the pointer from the first
3092 iteration in the segment to the last. Note that it does not include
3093 the size of the access; in effect it only describes the first byte. */
3095 static tree
3096 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3098 length_factor = size_binop (MINUS_EXPR,
3099 fold_convert (sizetype, length_factor),
3100 size_one_node);
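/* Worked example (assumed values): a 4-byte step and a LENGTH_FACTOR
   of 8 iterations give a segment length of 4 * (8 - 1) = 28 bytes;
   the bytes of the final access itself are accounted for separately
   via the access size.  */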
3101 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3102 length_factor);
3105 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3106 gives the worst-case number of bytes covered by the segment. */
3108 static unsigned HOST_WIDE_INT
3109 vect_vfa_access_size (data_reference *dr)
3111 stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3112 tree ref_type = TREE_TYPE (DR_REF (dr));
3113 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3114 unsigned HOST_WIDE_INT access_size = ref_size;
3115 if (GROUP_FIRST_ELEMENT (stmt_vinfo))
3117 gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
3118 access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo);
3120 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3121 && (vect_supportable_dr_alignment (dr, false)
3122 == dr_explicit_realign_optimized))
3124 /* We might access a full vector's worth. */
3125 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3126 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3128 return access_size;
3131 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3133 static unsigned int
3134 vect_vfa_align (const data_reference *dr)
3136 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3139 /* Function vect_no_alias_p.
3141 Given data references A and B with equal base and offset, see whether
3142 the alias relation can be decided at compilation time. Return 1 if
3143 it can and the references alias, 0 if it can and the references do
3144 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3145 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3146 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3148 static int
3149 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3150 tree segment_length_a, tree segment_length_b,
3151 unsigned HOST_WIDE_INT access_size_a,
3152 unsigned HOST_WIDE_INT access_size_b)
3154 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3155 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3156 poly_uint64 const_length_a;
3157 poly_uint64 const_length_b;
3159 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3160 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3161 [a, a+12) */
3162 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3164 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3165 offset_a = (offset_a + access_size_a) - const_length_a;
3167 else
3168 const_length_a = tree_to_poly_uint64 (segment_length_a);
3169 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3171 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3172 offset_b = (offset_b + access_size_b) - const_length_b;
3174 else
3175 const_length_b = tree_to_poly_uint64 (segment_length_b);
3177 const_length_a += access_size_a;
3178 const_length_b += access_size_b;
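/* Illustrative check (assumed values): a forward access with offset 0,
   segment length 16 and access size 4 covers [0, 20), so a second
   access starting at offset 32 provably cannot overlap it, while one
   starting at offset 8 provably does.  */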
3180 if (ranges_known_overlap_p (offset_a, const_length_a,
3181 offset_b, const_length_b))
3182 return 1;
3184 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3185 offset_b, const_length_b))
3186 return 0;
3188 return -1;
3191 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3192 in DDR is >= VF. */
3194 static bool
3195 dependence_distance_ge_vf (data_dependence_relation *ddr,
3196 unsigned int loop_depth, poly_uint64 vf)
3198 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3199 || DDR_NUM_DIST_VECTS (ddr) == 0)
3200 return false;
3202 /* If the dependence is exact, we should have limited the VF instead. */
3203 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3205 unsigned int i;
3206 lambda_vector dist_v;
3207 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3209 HOST_WIDE_INT dist = dist_v[loop_depth];
3210 if (dist != 0
3211 && !(dist > 0 && DDR_REVERSED_P (ddr))
3212 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3213 return false;
3216 if (dump_enabled_p ())
3218 dump_printf_loc (MSG_NOTE, vect_location,
3219 "dependence distance between ");
3220 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3221 dump_printf (MSG_NOTE, " and ");
3222 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3223 dump_printf (MSG_NOTE, " is >= VF\n");
3226 return true;
3229 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3231 static void
3232 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3234 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3235 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3236 dump_printf (dump_kind, ") >= ");
3237 dump_dec (dump_kind, lower_bound.min_value);
3240 /* Record that the vectorized loop requires the vec_lower_bound described
3241 by EXPR, UNSIGNED_P and MIN_VALUE. */
3243 static void
3244 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3245 poly_uint64 min_value)
3247 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3248 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3249 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3251 unsigned_p &= lower_bounds[i].unsigned_p;
3252 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3253 if (lower_bounds[i].unsigned_p != unsigned_p
3254 || maybe_lt (lower_bounds[i].min_value, min_value))
3256 lower_bounds[i].unsigned_p = unsigned_p;
3257 lower_bounds[i].min_value = min_value;
3258 if (dump_enabled_p ())
3260 dump_printf_loc (MSG_NOTE, vect_location,
3261 "updating run-time check to ");
3262 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3263 dump_printf (MSG_NOTE, "\n");
3266 return;
3269 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3270 if (dump_enabled_p ())
3272 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3273 dump_lower_bound (MSG_NOTE, lower_bound);
3274 dump_printf (MSG_NOTE, "\n");
3276 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3279 /* Return true if it's unlikely that the step of the vectorized form of DR
3280 will span fewer than GAP bytes. */
3282 static bool
3283 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3285 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3286 HOST_WIDE_INT count
3287 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3288 if (GROUP_FIRST_ELEMENT (stmt_info))
3289 count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
3290 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3293 /* Return true if we know that there is no alias between DR_A and DR_B
3294 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3295 *LOWER_BOUND_OUT to this N. */
3297 static bool
3298 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3299 poly_uint64 *lower_bound_out)
3301 /* Check that there is a constant gap of known sign between DR_A
3302 and DR_B. */
3303 poly_int64 init_a, init_b;
3304 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3305 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3306 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3307 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3308 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3309 || !ordered_p (init_a, init_b))
3310 return false;
3312 /* Sort DR_A and DR_B by the address they access. */
3313 if (maybe_lt (init_b, init_a))
3315 std::swap (init_a, init_b);
3316 std::swap (dr_a, dr_b);
3319 /* If the two accesses could be dependent within a scalar iteration,
3320 make sure that we'd retain their order. */
3321 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3322 && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
3323 return false;
3325 /* There is no alias if abs (DR_STEP) is greater than or equal to
3326 the bytes spanned by the combination of the two accesses. */
3327 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
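/* E.g. (illustrative values): for two 4-byte accesses with the same
   base whose inits are 0 and 8, the bound is 8 + 4 - 0 = 12, so the
   accesses cannot alias whenever the absolute step is at least 12
   bytes.  */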
3328 return true;
3331 /* Function vect_prune_runtime_alias_test_list.
3333 Prune a list of ddrs to be tested at run-time by versioning for alias.
3334 Merge several alias checks into one if possible.
3335 Return FALSE if the resulting list of ddrs is longer than allowed by
3336 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3338 bool
3339 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3341 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3342 hash_set <tree_pair_hash> compared_objects;
3344 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3345 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3346 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3347 vec<vec_object_pair> &check_unequal_addrs
3348 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3349 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3350 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3352 ddr_p ddr;
3353 unsigned int i;
3354 tree length_factor;
3356 if (dump_enabled_p ())
3357 dump_printf_loc (MSG_NOTE, vect_location,
3358 "=== vect_prune_runtime_alias_test_list ===\n");
3360 /* Step values are irrelevant for aliasing if the number of vector
3361 iterations is equal to the number of scalar iterations (which can
3362 happen for fully-SLP loops). */
3363 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3365 if (!ignore_step_p)
3367 /* Convert the checks for nonzero steps into bound tests. */
3368 tree value;
3369 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3370 vect_check_lower_bound (loop_vinfo, value, true, 1);
3373 if (may_alias_ddrs.is_empty ())
3374 return true;
3376 comp_alias_ddrs.create (may_alias_ddrs.length ());
3378 unsigned int loop_depth
3379 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3380 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3382 /* First, we collect all data ref pairs for aliasing checks. */
3383 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3385 int comp_res;
3386 poly_uint64 lower_bound;
3387 struct data_reference *dr_a, *dr_b;
3388 gimple *dr_group_first_a, *dr_group_first_b;
3389 tree segment_length_a, segment_length_b;
3390 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3391 unsigned int align_a, align_b;
3392 gimple *stmt_a, *stmt_b;
3394 /* Ignore the alias if the VF we chose ended up being no greater
3395 than the dependence distance. */
3396 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3397 continue;
3399 if (DDR_OBJECT_A (ddr))
3401 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3402 if (!compared_objects.add (new_pair))
3404 if (dump_enabled_p ())
3406 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3407 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3408 dump_printf (MSG_NOTE, " and ");
3409 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3410 dump_printf (MSG_NOTE, " have different addresses\n");
3412 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3414 continue;
3417 dr_a = DDR_A (ddr);
3418 stmt_a = DR_STMT (DDR_A (ddr));
3420 dr_b = DDR_B (ddr);
3421 stmt_b = DR_STMT (DDR_B (ddr));
3423 /* Skip the pair if inter-iteration dependencies are irrelevant
3424 and intra-iteration dependencies are guaranteed to be honored. */
3425 if (ignore_step_p
3426 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3427 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3429 if (dump_enabled_p ())
3431 dump_printf_loc (MSG_NOTE, vect_location,
3432 "no need for alias check between ");
3433 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3434 dump_printf (MSG_NOTE, " and ");
3435 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3436 dump_printf (MSG_NOTE, " when VF is 1\n");
3438 continue;
3441 /* See whether we can handle the alias using a bounds check on
3442 the step, and whether that's likely to be the best approach.
3443 (It might not be, for example, if the minimum step is much larger
3444 than the number of bytes handled by one vector iteration.) */
3445 if (!ignore_step_p
3446 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3447 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3448 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3449 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3451 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3452 if (dump_enabled_p ())
3454 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3455 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3456 dump_printf (MSG_NOTE, " and ");
3457 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3458 dump_printf (MSG_NOTE, " when the step ");
3459 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3460 dump_printf (MSG_NOTE, " is outside ");
3461 if (unsigned_p)
3462 dump_printf (MSG_NOTE, "[0");
3463 else
3465 dump_printf (MSG_NOTE, "(");
3466 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3468 dump_printf (MSG_NOTE, ", ");
3469 dump_dec (MSG_NOTE, lower_bound);
3470 dump_printf (MSG_NOTE, ")\n");
3472 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3473 lower_bound);
3474 continue;
3477 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3478 if (dr_group_first_a)
3480 stmt_a = dr_group_first_a;
3481 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3484 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3485 if (dr_group_first_b)
3487 stmt_b = dr_group_first_b;
3488 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3491 if (ignore_step_p)
3493 segment_length_a = size_zero_node;
3494 segment_length_b = size_zero_node;
3496 else
3498 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3499 length_factor = scalar_loop_iters;
3500 else
3501 length_factor = size_int (vect_factor);
3502 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3503 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3505 access_size_a = vect_vfa_access_size (dr_a);
3506 access_size_b = vect_vfa_access_size (dr_b);
3507 align_a = vect_vfa_align (dr_a);
3508 align_b = vect_vfa_align (dr_b);
3510 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3511 DR_BASE_ADDRESS (dr_b));
3512 if (comp_res == 0)
3513 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3514 DR_OFFSET (dr_b));
3516 /* See whether the alias is known at compilation time. */
3517 if (comp_res == 0
3518 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3519 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3520 && poly_int_tree_p (segment_length_a)
3521 && poly_int_tree_p (segment_length_b))
3523 int res = vect_compile_time_alias (dr_a, dr_b,
3524 segment_length_a,
3525 segment_length_b,
3526 access_size_a,
3527 access_size_b);
3528 if (res >= 0 && dump_enabled_p ())
3530 dump_printf_loc (MSG_NOTE, vect_location,
3531 "can tell at compile time that ");
3532 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3533 dump_printf (MSG_NOTE, " and ");
3534 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3535 if (res == 0)
3536 dump_printf (MSG_NOTE, " do not alias\n");
3537 else
3538 dump_printf (MSG_NOTE, " alias\n");
3541 if (res == 0)
3542 continue;
3544 if (res == 1)
3546 if (dump_enabled_p ())
3547 dump_printf_loc (MSG_NOTE, vect_location,
3548 "not vectorized: compilation time alias.\n");
3549 return false;
3553 dr_with_seg_len_pair_t dr_with_seg_len_pair
3554 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3555 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3557 /* Canonicalize pairs by sorting the two DR members. */
3558 if (comp_res > 0)
3559 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3561 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3564 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3566 unsigned int count = (comp_alias_ddrs.length ()
3567 + check_unequal_addrs.length ());
3569 dump_printf_loc (MSG_NOTE, vect_location,
3570 "improved number of alias checks from %d to %d\n",
3571 may_alias_ddrs.length (), count);
3572 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3574 if (dump_enabled_p ())
3575 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3576 "number of versioning for alias "
3577 "run-time tests exceeds %d "
3578 "(--param vect-max-version-for-alias-checks)\n",
3579 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3580 return false;
3583 return true;
3586 /* Check whether we can use an internal function for a gather load
3587 or scatter store. READ_P is true for loads and false for stores.
3588 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3589 the type of the memory elements being loaded or stored. OFFSET_BITS
3590 is the number of bits in each scalar offset and OFFSET_SIGN is the
3591 sign of the offset. SCALE is the amount by which the offset should
3592 be multiplied *after* it has been converted to address width.
3594 Return true if the function is supported, storing the function
3595 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
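/* As a usage sketch (the tree operands here are placeholders obtained
   elsewhere): checking whether a masked gather of 32-bit floats with
   32-bit signed indices scaled by 4 is available would look like

     internal_fn ifn;
     tree elt_type;
     if (vect_gather_scatter_fn_p (true, true, v8sf_vectype, float_type,
				   32, SIGNED, 4, &ifn, &elt_type))
       ... use IFN_MASK_GATHER_LOAD via IFN ...

   where v8sf_vectype and float_type stand for the vector and memory
   element types of the access.  */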
3597 bool
3598 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3599 tree memory_type, unsigned int offset_bits,
3600 signop offset_sign, int scale,
3601 internal_fn *ifn_out, tree *element_type_out)
3603 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3604 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3605 if (offset_bits > element_bits)
3606 /* Internal functions require the offset to be the same width as
3607 the vector elements. We can extend narrower offsets, but it isn't
3608 safe to truncate wider offsets. */
3609 return false;
3611 if (element_bits != memory_bits)
3612 /* For now the vector elements must be the same width as the
3613 memory elements. */
3614 return false;
3616 /* Work out which function we need. */
3617 internal_fn ifn;
3618 if (read_p)
3619 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3620 else
3621 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3623 /* Test whether the target supports this combination. */
3624 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3625 offset_sign, scale))
3626 return false;
3628 *ifn_out = ifn;
3629 *element_type_out = TREE_TYPE (vectype);
3630 return true;
3633 /* CALL is a call to an internal gather load or scatter store function.
3634 Describe the operation in INFO. */
3636 static void
3637 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3639 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3640 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3641 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3643 info->ifn = gimple_call_internal_fn (call);
3644 info->decl = NULL_TREE;
3645 info->base = gimple_call_arg (call, 0);
3646 info->offset = gimple_call_arg (call, 1);
3647 info->offset_dt = vect_unknown_def_type;
3648 info->offset_vectype = NULL_TREE;
3649 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3650 info->element_type = TREE_TYPE (vectype);
3651 info->memory_type = TREE_TYPE (DR_REF (dr));
3654 /* Return true if a non-affine read or write in STMT is suitable for a
3655 gather load or scatter store. Describe the operation in *INFO if so. */
3657 bool
3658 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3659 gather_scatter_info *info)
3661 HOST_WIDE_INT scale = 1;
3662 poly_int64 pbitpos, pbitsize;
3663 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3664 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3665 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3666 tree offtype = NULL_TREE;
3667 tree decl = NULL_TREE, base, off;
3668 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3669 tree memory_type = TREE_TYPE (DR_REF (dr));
3670 machine_mode pmode;
3671 int punsignedp, reversep, pvolatilep = 0;
3672 internal_fn ifn;
3673 tree element_type;
3674 bool masked_p = false;
3676 /* See whether this is already a call to a gather/scatter internal function.
3677 If not, see whether it's a masked load or store. */
3678 gcall *call = dyn_cast <gcall *> (stmt);
3679 if (call && gimple_call_internal_p (call))
3681 ifn = gimple_call_internal_fn (stmt);
3682 if (internal_gather_scatter_fn_p (ifn))
3684 vect_describe_gather_scatter_call (call, info);
3685 return true;
3687 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3690 /* True if we should aim to use internal functions rather than
3691 built-in functions. */
3692 bool use_ifn_p = (DR_IS_READ (dr)
3693 ? supports_vec_gather_load_p ()
3694 : supports_vec_scatter_store_p ());
3696 base = DR_REF (dr);
3697 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3698 see if we can use the def stmt of the address. */
3699 if (masked_p
3700 && TREE_CODE (base) == MEM_REF
3701 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3702 && integer_zerop (TREE_OPERAND (base, 1))
3703 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3705 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3706 if (is_gimple_assign (def_stmt)
3707 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3708 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3711 /* The gather and scatter builtins need an address of the form
3712 loop_invariant + vector * {1, 2, 4, 8}
3714 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3715 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3716 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3717 multiplications and additions in it. To get a vector, we need
3718 a single SSA_NAME that will be defined in the loop and will
3719 contain everything that is not loop invariant and that can be
3720 vectorized. The following code attempts to find such a preexisting
3721 SSA_NAME OFF and put the loop invariants into a tree BASE
3722 that can be gimplified before the loop. */
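/* As a purely illustrative example, assuming 32-bit elements and
   32-bit indices:

     for (i = 0; i < n; i++)
       out[i] = in[idx[i]];

   loads from in + sign_extend (idx[i]) * 4.  The loop invariant "in"
   (plus any constant offset) ends up in BASE, the SSA_NAME carrying
   the loaded index becomes OFF, and the multiplication by 4 is folded
   into SCALE by the loop below.  */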
3723 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3724 &punsignedp, &reversep, &pvolatilep);
3725 gcc_assert (base && !reversep);
3726 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3728 if (TREE_CODE (base) == MEM_REF)
3730 if (!integer_zerop (TREE_OPERAND (base, 1)))
3732 if (off == NULL_TREE)
3733 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3734 else
3735 off = size_binop (PLUS_EXPR, off,
3736 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3738 base = TREE_OPERAND (base, 0);
3740 else
3741 base = build_fold_addr_expr (base);
3743 if (off == NULL_TREE)
3744 off = size_zero_node;
3746 /* If base is not loop invariant, then if off is 0 we start with just
3747 the constant offset in the loop invariant BASE and continue with base
3748 as OFF, otherwise give up.
3749 We could handle that case by gimplifying the addition of base + off
3750 into some SSA_NAME and use that as off, but for now punt. */
3751 if (!expr_invariant_in_loop_p (loop, base))
3753 if (!integer_zerop (off))
3754 return false;
3755 off = base;
3756 base = size_int (pbytepos);
3758 /* Otherwise put base + constant offset into the loop invariant BASE
3759 and continue with OFF. */
3760 else
3762 base = fold_convert (sizetype, base);
3763 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3766 /* OFF at this point may be either a SSA_NAME or some tree expression
3767 from get_inner_reference. Try to peel off loop invariants from it
3768 into BASE as long as possible. */
3769 STRIP_NOPS (off);
3770 while (offtype == NULL_TREE)
3772 enum tree_code code;
3773 tree op0, op1, add = NULL_TREE;
3775 if (TREE_CODE (off) == SSA_NAME)
3777 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3779 if (expr_invariant_in_loop_p (loop, off))
3780 return false;
3782 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3783 break;
3785 op0 = gimple_assign_rhs1 (def_stmt);
3786 code = gimple_assign_rhs_code (def_stmt);
3787 op1 = gimple_assign_rhs2 (def_stmt);
3789 else
3791 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3792 return false;
3793 code = TREE_CODE (off);
3794 extract_ops_from_tree (off, &code, &op0, &op1);
3796 switch (code)
3798 case POINTER_PLUS_EXPR:
3799 case PLUS_EXPR:
3800 if (expr_invariant_in_loop_p (loop, op0))
3802 add = op0;
3803 off = op1;
3804 do_add:
3805 add = fold_convert (sizetype, add);
3806 if (scale != 1)
3807 add = size_binop (MULT_EXPR, add, size_int (scale));
3808 base = size_binop (PLUS_EXPR, base, add);
3809 continue;
3811 if (expr_invariant_in_loop_p (loop, op1))
3813 add = op1;
3814 off = op0;
3815 goto do_add;
3817 break;
3818 case MINUS_EXPR:
3819 if (expr_invariant_in_loop_p (loop, op1))
3821 add = fold_convert (sizetype, op1);
3822 add = size_binop (MINUS_EXPR, size_zero_node, add);
3823 off = op0;
3824 goto do_add;
3826 break;
3827 case MULT_EXPR:
3828 if (scale == 1 && tree_fits_shwi_p (op1))
3830 int new_scale = tree_to_shwi (op1);
3831 /* Only treat this as a scaling operation if the target
3832 supports it. */
3833 if (use_ifn_p
3834 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3835 vectype, memory_type, 1,
3836 TYPE_SIGN (TREE_TYPE (op0)),
3837 new_scale, &ifn,
3838 &element_type))
3839 break;
3840 scale = new_scale;
3841 off = op0;
3842 continue;
3844 break;
3845 case SSA_NAME:
3846 off = op0;
3847 continue;
3848 CASE_CONVERT:
3849 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3850 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3851 break;
3852 if (TYPE_PRECISION (TREE_TYPE (op0))
3853 == TYPE_PRECISION (TREE_TYPE (off)))
3855 off = op0;
3856 continue;
3859 /* The internal functions need the offset to be the same width
3860 as the elements of VECTYPE. Don't include operations that
3861 cast the offset from that width to a different width. */
3862 if (use_ifn_p
3863 && (int_size_in_bytes (TREE_TYPE (vectype))
3864 == int_size_in_bytes (TREE_TYPE (off))))
3865 break;
3867 if (TYPE_PRECISION (TREE_TYPE (op0))
3868 < TYPE_PRECISION (TREE_TYPE (off)))
3870 off = op0;
3871 offtype = TREE_TYPE (off);
3872 STRIP_NOPS (off);
3873 continue;
3875 break;
3876 default:
3877 break;
3879 break;
3882 /* If at the end OFF still isn't a SSA_NAME or isn't
3883 defined in the loop, punt. */
3884 if (TREE_CODE (off) != SSA_NAME
3885 || expr_invariant_in_loop_p (loop, off))
3886 return false;
3888 if (offtype == NULL_TREE)
3889 offtype = TREE_TYPE (off);
3891 if (use_ifn_p)
3893 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3894 memory_type, TYPE_PRECISION (offtype),
3895 TYPE_SIGN (offtype), scale, &ifn,
3896 &element_type))
3897 return false;
3899 else
3901 if (DR_IS_READ (dr))
3903 if (targetm.vectorize.builtin_gather)
3904 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3906 else
3908 if (targetm.vectorize.builtin_scatter)
3909 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3912 if (!decl)
3913 return false;
3915 ifn = IFN_LAST;
3916 element_type = TREE_TYPE (vectype);
3919 info->ifn = ifn;
3920 info->decl = decl;
3921 info->base = base;
3922 info->offset = off;
3923 info->offset_dt = vect_unknown_def_type;
3924 info->offset_vectype = NULL_TREE;
3925 info->scale = scale;
3926 info->element_type = element_type;
3927 info->memory_type = memory_type;
3928 return true;
3931 /* Function vect_analyze_data_refs.
3933 Find all the data references in the loop or basic block.
3935 The general structure of the analysis of data refs in the vectorizer is as
3936 follows:
3937 1- vect_analyze_data_refs(loop/bb): call
3938 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3939 in the loop/bb and their dependences.
3940 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3941 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3942 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3946 bool
3947 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3949 struct loop *loop = NULL;
3950 unsigned int i;
3951 struct data_reference *dr;
3952 tree scalar_type;
3954 if (dump_enabled_p ())
3955 dump_printf_loc (MSG_NOTE, vect_location,
3956 "=== vect_analyze_data_refs ===\n");
3958 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3959 loop = LOOP_VINFO_LOOP (loop_vinfo);
3961 /* Go through the data-refs, check that the analysis succeeded. Update
3962 pointer from stmt_vec_info struct to DR and vectype. */
3964 vec<data_reference_p> datarefs = vinfo->datarefs;
3965 FOR_EACH_VEC_ELT (datarefs, i, dr)
3967 gimple *stmt;
3968 stmt_vec_info stmt_info;
3969 tree base, offset, init;
3970 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3971 bool simd_lane_access = false;
3972 poly_uint64 vf;
3974 again:
3975 if (!dr || !DR_REF (dr))
3977 if (dump_enabled_p ())
3978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3979 "not vectorized: unhandled data-ref\n");
3980 return false;
3983 stmt = DR_STMT (dr);
3984 stmt_info = vinfo_for_stmt (stmt);
3986 /* Discard clobbers from the dataref vector. We will remove
3987 clobber stmts during vectorization. */
3988 if (gimple_clobber_p (stmt))
3990 free_data_ref (dr);
3991 if (i == datarefs.length () - 1)
3993 datarefs.pop ();
3994 break;
3996 datarefs.ordered_remove (i);
3997 dr = datarefs[i];
3998 goto again;
4001 /* Check that analysis of the data-ref succeeded. */
4002 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4003 || !DR_STEP (dr))
4005 bool maybe_gather
4006 = DR_IS_READ (dr)
4007 && !TREE_THIS_VOLATILE (DR_REF (dr))
4008 && (targetm.vectorize.builtin_gather != NULL
4009 || supports_vec_gather_load_p ());
4010 bool maybe_scatter
4011 = DR_IS_WRITE (dr)
4012 && !TREE_THIS_VOLATILE (DR_REF (dr))
4013 && (targetm.vectorize.builtin_scatter != NULL
4014 || supports_vec_scatter_store_p ());
4015 bool maybe_simd_lane_access
4016 = is_a <loop_vec_info> (vinfo) && loop->simduid;
4018 /* If the target supports vector gather loads or scatter stores, or if
4019 this might be a SIMD lane access, see whether they can be used. */
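/* A SIMD lane access typically arises for a variable that is privatized
   in an OpenMP simd loop but cannot live in a register, e.g. (sketch
   only):

     float t;
     #pragma omp simd lastprivate (t)
     for (i = 0; i < n; i++)
       { t = a[i] * 2.0f; b[i] = t; }

   omp lowering may give T one slot per SIMD lane, indexed by
   .GOMP_SIMD_LANE (simduid), so the access has a zero step and an
   offset of the form lane * sizeof (float), which is the shape the
   code below looks for.  */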
4020 if (is_a <loop_vec_info> (vinfo)
4021 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4022 && !nested_in_vect_loop_p (loop, stmt))
4024 struct data_reference *newdr
4025 = create_data_ref (NULL, loop_containing_stmt (stmt),
4026 DR_REF (dr), stmt, !maybe_scatter,
4027 DR_IS_CONDITIONAL_IN_STMT (dr));
4028 gcc_assert (newdr != NULL && DR_REF (newdr));
4029 if (DR_BASE_ADDRESS (newdr)
4030 && DR_OFFSET (newdr)
4031 && DR_INIT (newdr)
4032 && DR_STEP (newdr)
4033 && integer_zerop (DR_STEP (newdr)))
4035 if (maybe_simd_lane_access)
4037 tree off = DR_OFFSET (newdr);
4038 STRIP_NOPS (off);
4039 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4040 && TREE_CODE (off) == MULT_EXPR
4041 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4043 tree step = TREE_OPERAND (off, 1);
4044 off = TREE_OPERAND (off, 0);
4045 STRIP_NOPS (off);
4046 if (CONVERT_EXPR_P (off)
4047 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4048 0)))
4049 < TYPE_PRECISION (TREE_TYPE (off)))
4050 off = TREE_OPERAND (off, 0);
4051 if (TREE_CODE (off) == SSA_NAME)
4053 gimple *def = SSA_NAME_DEF_STMT (off);
4054 tree reft = TREE_TYPE (DR_REF (newdr));
4055 if (is_gimple_call (def)
4056 && gimple_call_internal_p (def)
4057 && (gimple_call_internal_fn (def)
4058 == IFN_GOMP_SIMD_LANE))
4060 tree arg = gimple_call_arg (def, 0);
4061 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4062 arg = SSA_NAME_VAR (arg);
4063 if (arg == loop->simduid
4064 /* For now. */
4065 && tree_int_cst_equal
4066 (TYPE_SIZE_UNIT (reft),
4067 step))
4069 DR_OFFSET (newdr) = ssize_int (0);
4070 DR_STEP (newdr) = step;
4071 DR_OFFSET_ALIGNMENT (newdr)
4072 = BIGGEST_ALIGNMENT;
4073 DR_STEP_ALIGNMENT (newdr)
4074 = highest_pow2_factor (step);
4075 dr = newdr;
4076 simd_lane_access = true;
4082 if (!simd_lane_access && (maybe_gather || maybe_scatter))
4084 dr = newdr;
4085 if (maybe_gather)
4086 gatherscatter = GATHER;
4087 else
4088 gatherscatter = SCATTER;
4091 if (gatherscatter == SG_NONE && !simd_lane_access)
4092 free_data_ref (newdr);
4095 if (gatherscatter == SG_NONE && !simd_lane_access)
4097 if (dump_enabled_p ())
4099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4100 "not vectorized: data ref analysis "
4101 "failed ");
4102 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4105 if (is_a <bb_vec_info> (vinfo))
4106 break;
4108 return false;
4112 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4114 if (dump_enabled_p ())
4115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4116 "not vectorized: base addr of dr is a "
4117 "constant\n");
4119 if (is_a <bb_vec_info> (vinfo))
4120 break;
4122 if (gatherscatter != SG_NONE || simd_lane_access)
4123 free_data_ref (dr);
4124 return false;
4127 if (TREE_THIS_VOLATILE (DR_REF (dr)))
4129 if (dump_enabled_p ())
4131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4132 "not vectorized: volatile type ");
4133 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4136 if (is_a <bb_vec_info> (vinfo))
4137 break;
4139 return false;
4142 if (stmt_can_throw_internal (stmt))
4144 if (dump_enabled_p ())
4146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4147 "not vectorized: statement can throw an "
4148 "exception ");
4149 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4152 if (is_a <bb_vec_info> (vinfo))
4153 break;
4155 if (gatherscatter != SG_NONE || simd_lane_access)
4156 free_data_ref (dr);
4157 return false;
4160 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4161 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4163 if (dump_enabled_p ())
4165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4166 "not vectorized: statement is bitfield "
4167 "access ");
4168 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4171 if (is_a <bb_vec_info> (vinfo))
4172 break;
4174 if (gatherscatter != SG_NONE || simd_lane_access)
4175 free_data_ref (dr);
4176 return false;
4179 base = unshare_expr (DR_BASE_ADDRESS (dr));
4180 offset = unshare_expr (DR_OFFSET (dr));
4181 init = unshare_expr (DR_INIT (dr));
4183 if (is_gimple_call (stmt)
4184 && (!gimple_call_internal_p (stmt)
4185 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
4186 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
4188 if (dump_enabled_p ())
4190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4191 "not vectorized: dr in a call ");
4192 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4195 if (is_a <bb_vec_info> (vinfo))
4196 break;
4198 if (gatherscatter != SG_NONE || simd_lane_access)
4199 free_data_ref (dr);
4200 return false;
4203 /* Update DR field in stmt_vec_info struct. */
4205 /* If the dataref is in an inner-loop of the loop that is considered for
4206 vectorization, we also want to analyze the access relative to
4207 the outer-loop (DR contains information only relative to the
4208 inner-most enclosing loop). We do that by building a reference to the
4209 first location accessed by the inner-loop, and analyze it relative to
4210 the outer-loop. */
4211 if (loop && nested_in_vect_loop_p (loop, stmt))
4213 /* Build a reference to the first location accessed by the
4214 inner loop: *(BASE + INIT + OFFSET). By construction,
4215 this address must be invariant in the inner loop, so we
4216 can consider it as being used in the outer loop. */
4217 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4218 init, offset);
4219 tree init_addr = fold_build_pointer_plus (base, init_offset);
4220 tree init_ref = build_fold_indirect_ref (init_addr);
4222 if (dump_enabled_p ())
4224 dump_printf_loc (MSG_NOTE, vect_location,
4225 "analyze in outer loop: ");
4226 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4227 dump_printf (MSG_NOTE, "\n");
4230 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4231 init_ref, loop))
4232 /* dr_analyze_innermost already explained the failure. */
4233 return false;
4235 if (dump_enabled_p ())
4237 dump_printf_loc (MSG_NOTE, vect_location,
4238 "\touter base_address: ");
4239 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4240 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4241 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4242 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4243 STMT_VINFO_DR_OFFSET (stmt_info));
4244 dump_printf (MSG_NOTE,
4245 "\n\touter constant offset from base address: ");
4246 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4247 STMT_VINFO_DR_INIT (stmt_info));
4248 dump_printf (MSG_NOTE, "\n\touter step: ");
4249 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4250 STMT_VINFO_DR_STEP (stmt_info));
4251 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4252 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4253 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4254 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4255 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4256 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4257 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4258 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4262 if (STMT_VINFO_DATA_REF (stmt_info))
4264 if (dump_enabled_p ())
4266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4267 "not vectorized: more than one data ref "
4268 "in stmt: ");
4269 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4272 if (is_a <bb_vec_info> (vinfo))
4273 break;
4275 if (gatherscatter != SG_NONE || simd_lane_access)
4276 free_data_ref (dr);
4277 return false;
4280 STMT_VINFO_DATA_REF (stmt_info) = dr;
4281 if (simd_lane_access)
4283 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4284 free_data_ref (datarefs[i]);
4285 datarefs[i] = dr;
4288 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4289 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4290 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4292 if (dump_enabled_p ())
4294 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4295 "not vectorized: base object not addressable "
4296 "for stmt: ");
4297 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4299 if (is_a <bb_vec_info> (vinfo))
4301 /* In BB vectorization the ref can still participate
4302 in dependence analysis, we just can't vectorize it. */
4303 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4304 continue;
4306 return false;
4309 /* Set vectype for STMT. */
4310 scalar_type = TREE_TYPE (DR_REF (dr));
4311 STMT_VINFO_VECTYPE (stmt_info)
4312 = get_vectype_for_scalar_type (scalar_type);
4313 if (!STMT_VINFO_VECTYPE (stmt_info))
4315 if (dump_enabled_p ())
4317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4318 "not vectorized: no vectype for stmt: ");
4319 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4320 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4321 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4322 scalar_type);
4323 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4326 if (is_a <bb_vec_info> (vinfo))
4328 /* No vector type is fine, the ref can still participate
4329 in dependence analysis, we just can't vectorize it. */
4330 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4331 continue;
4334 if (gatherscatter != SG_NONE || simd_lane_access)
4336 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4337 if (gatherscatter != SG_NONE)
4338 free_data_ref (dr);
4340 return false;
4342 else
4344 if (dump_enabled_p ())
4346 dump_printf_loc (MSG_NOTE, vect_location,
4347 "got vectype for stmt: ");
4348 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4349 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4350 STMT_VINFO_VECTYPE (stmt_info));
4351 dump_printf (MSG_NOTE, "\n");
4355 /* Adjust the minimal vectorization factor according to the
4356 vector type. */
4357 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4358 *min_vf = upper_bound (*min_vf, vf);
4360 if (gatherscatter != SG_NONE)
4362 gather_scatter_info gs_info;
4363 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4364 &gs_info)
4365 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4367 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4368 free_data_ref (dr);
4369 if (dump_enabled_p ())
4371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4372 (gatherscatter == GATHER) ?
4373 "not vectorized: not suitable for gather "
4374 "load " :
4375 "not vectorized: not suitable for scatter "
4376 "store ");
4377 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4379 return false;
4382 free_data_ref (datarefs[i]);
4383 datarefs[i] = dr;
4384 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4387 else if (is_a <loop_vec_info> (vinfo)
4388 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4390 if (nested_in_vect_loop_p (loop, stmt))
4392 if (dump_enabled_p ())
4394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4395 "not vectorized: not suitable for strided "
4396 "load ");
4397 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4399 return false;
4401 STMT_VINFO_STRIDED_P (stmt_info) = true;
4405 /* If we stopped analysis at the first dataref we could not analyze
4406 when trying to vectorize a basic-block mark the rest of the datarefs
4407 as not vectorizable and truncate the vector of datarefs. That
4408 avoids spending useless time in analyzing their dependence. */
4409 if (i != datarefs.length ())
4411 gcc_assert (is_a <bb_vec_info> (vinfo));
4412 for (unsigned j = i; j < datarefs.length (); ++j)
4414 data_reference_p dr = datarefs[j];
4415 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4416 free_data_ref (dr);
4418 datarefs.truncate (i);
4421 return true;
4425 /* Function vect_get_new_vect_var.
4427 Returns a name for a new variable. The current naming scheme prepends
4428 a prefix chosen by VAR_KIND ("vect", "stmp", "mask" or "vectp") to
4429 NAME if NAME is provided, and otherwise uses the prefix alone as the
4430 name of the vectorizer-generated variable. */
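/* For example (hypothetical arguments), a call such as

     vect_get_new_vect_var (ptr_type, vect_pointer_var, "a")

   returns a temporary register whose name is based on "vectp_a", while
   passing a NULL name yields one based on just "vectp".  */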
4432 tree
4433 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4435 const char *prefix;
4436 tree new_vect_var;
4438 switch (var_kind)
4440 case vect_simple_var:
4441 prefix = "vect";
4442 break;
4443 case vect_scalar_var:
4444 prefix = "stmp";
4445 break;
4446 case vect_mask_var:
4447 prefix = "mask";
4448 break;
4449 case vect_pointer_var:
4450 prefix = "vectp";
4451 break;
4452 default:
4453 gcc_unreachable ();
4456 if (name)
4458 char* tmp = concat (prefix, "_", name, NULL);
4459 new_vect_var = create_tmp_reg (type, tmp);
4460 free (tmp);
4462 else
4463 new_vect_var = create_tmp_reg (type, prefix);
4465 return new_vect_var;
4468 /* Like vect_get_new_vect_var but return an SSA name. */
4470 tree
4471 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4473 const char *prefix;
4474 tree new_vect_var;
4476 switch (var_kind)
4478 case vect_simple_var:
4479 prefix = "vect";
4480 break;
4481 case vect_scalar_var:
4482 prefix = "stmp";
4483 break;
4484 case vect_pointer_var:
4485 prefix = "vectp";
4486 break;
4487 default:
4488 gcc_unreachable ();
4491 if (name)
4493 char* tmp = concat (prefix, "_", name, NULL);
4494 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4495 free (tmp);
4497 else
4498 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4500 return new_vect_var;
4503 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4505 static void
4506 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4508 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4509 int misalign = DR_MISALIGNMENT (dr);
4510 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4511 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4512 else
4513 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4514 DR_TARGET_ALIGNMENT (dr), misalign);
4517 /* Function vect_create_addr_base_for_vector_ref.
4519 Create an expression that computes the address of the first memory location
4520 that will be accessed for a data reference.
4522 Input:
4523 STMT: The statement containing the data reference.
4524 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4525 OFFSET: Optional. If supplied, it is added to the initial address.
4526 LOOP: Specify relative to which loop-nest the address should be computed.
4527 For example, when the dataref is in an inner-loop nested in an
4528 outer-loop that is now being vectorized, LOOP can be either the
4529 outer-loop, or the inner-loop. The first memory location accessed
4530 by the following dataref ('in' points to short):
4532 for (i=0; i<N; i++)
4533 for (j=0; j<M; j++)
4534 s += in[i+j]
4536 is as follows:
4537 if LOOP=i_loop: &in (relative to i_loop)
4538 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4539 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4540 initial address. Unlike OFFSET, which is a number of elements to
4541 be added, BYTE_OFFSET is measured in bytes.
4543 Output:
4544 1. Return an SSA_NAME whose value is the address of the memory location of
4545 the first vector of the data reference.
4546 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4547 these statement(s) which define the returned SSA_NAME.
4549 FORNOW: We are only handling array accesses with step 1. */
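/* To illustrate the difference between the two offsets with the dataref
   above ('in' points to short): OFFSET = 3 advances the returned address
   by 3 * 2 bytes, because OFFSET is scaled by the element size, whereas
   BYTE_OFFSET = 3 advances it by exactly 3 bytes.  */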
4551 tree
4552 vect_create_addr_base_for_vector_ref (gimple *stmt,
4553 gimple_seq *new_stmt_list,
4554 tree offset,
4555 tree byte_offset)
4557 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4558 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4559 const char *base_name;
4560 tree addr_base;
4561 tree dest;
4562 gimple_seq seq = NULL;
4563 tree vect_ptr_type;
4564 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4565 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4566 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4568 tree data_ref_base = unshare_expr (drb->base_address);
4569 tree base_offset = unshare_expr (drb->offset);
4570 tree init = unshare_expr (drb->init);
4572 if (loop_vinfo)
4573 base_name = get_name (data_ref_base);
4574 else
4576 base_offset = ssize_int (0);
4577 init = ssize_int (0);
4578 base_name = get_name (DR_REF (dr));
4581 /* Create base_offset */
4582 base_offset = size_binop (PLUS_EXPR,
4583 fold_convert (sizetype, base_offset),
4584 fold_convert (sizetype, init));
4586 if (offset)
4588 offset = fold_build2 (MULT_EXPR, sizetype,
4589 fold_convert (sizetype, offset), step);
4590 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4591 base_offset, offset);
4593 if (byte_offset)
4595 byte_offset = fold_convert (sizetype, byte_offset);
4596 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4597 base_offset, byte_offset);
4600 /* base + base_offset */
4601 if (loop_vinfo)
4602 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4603 else
4605 addr_base = build1 (ADDR_EXPR,
4606 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4607 unshare_expr (DR_REF (dr)));
4610 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4611 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4612 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4613 gimple_seq_add_seq (new_stmt_list, seq);
4615 if (DR_PTR_INFO (dr)
4616 && TREE_CODE (addr_base) == SSA_NAME
4617 && !SSA_NAME_PTR_INFO (addr_base))
4619 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4620 if (offset || byte_offset)
4621 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4624 if (dump_enabled_p ())
4626 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4627 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4628 dump_printf (MSG_NOTE, "\n");
4631 return addr_base;
4635 /* Function vect_create_data_ref_ptr.
4637 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4638 location accessed in the loop by STMT, along with the def-use update
4639 chain to appropriately advance the pointer through the loop iterations.
4640 Also set aliasing information for the pointer. This pointer is used by
4641 the callers to this function to create a memory reference expression for
4642 vector load/store access.
4644 Input:
4645 1. STMT: a stmt that references memory. Expected to be of the form
4646 GIMPLE_ASSIGN <name, data-ref> or
4647 GIMPLE_ASSIGN <data-ref, name>.
4648 2. AGGR_TYPE: the type of the reference, which should be either a vector
4649 or an array.
4650 3. AT_LOOP: the loop where the vector memref is to be created.
4651 4. OFFSET (optional): an offset to be added to the initial address accessed
4652 by the data-ref in STMT.
4653 5. BSI: location where the new stmts are to be placed if there is no loop
4654 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4655 pointing to the initial address.
4656 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4657 to the initial address accessed by the data-ref in STMT. This is
4658 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4659 in bytes.
4660 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4661 to the IV during each iteration of the loop. NULL says to move
4662 by one copy of AGGR_TYPE up or down, depending on the step of the
4663 data reference.
4665 Output:
4666 1. Declare a new ptr to vector_type, and have it point to the base of the
4667 data reference (initial address accessed by the data reference).
4668 For example, for vector of type V8HI, the following code is generated:
4670 v8hi *ap;
4671 ap = (v8hi *)initial_address;
4673 if OFFSET is not supplied:
4674 initial_address = &a[init];
4675 if OFFSET is supplied:
4676 initial_address = &a[init + OFFSET];
4677 if BYTE_OFFSET is supplied:
4678 initial_address = &a[init] + BYTE_OFFSET;
4680 Return the initial_address in INITIAL_ADDRESS.
4682 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4683 update the pointer in each iteration of the loop.
4685 Return the increment stmt that updates the pointer in PTR_INCR.
4687 3. Set INV_P to true if the access pattern of the data reference in the
4688 vectorized loop is invariant. Set it to false otherwise.
4690 4. Return the pointer. */
4692 tree
4693 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4694 tree offset, tree *initial_address,
4695 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4696 bool only_init, bool *inv_p, tree byte_offset,
4697 tree iv_step)
4699 const char *base_name;
4700 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4701 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4702 struct loop *loop = NULL;
4703 bool nested_in_vect_loop = false;
4704 struct loop *containing_loop = NULL;
4705 tree aggr_ptr_type;
4706 tree aggr_ptr;
4707 tree new_temp;
4708 gimple_seq new_stmt_list = NULL;
4709 edge pe = NULL;
4710 basic_block new_bb;
4711 tree aggr_ptr_init;
4712 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4713 tree aptr;
4714 gimple_stmt_iterator incr_gsi;
4715 bool insert_after;
4716 tree indx_before_incr, indx_after_incr;
4717 gimple *incr;
4718 tree step;
4719 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4721 gcc_assert (iv_step != NULL_TREE
4722 || TREE_CODE (aggr_type) == ARRAY_TYPE
4723 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4725 if (loop_vinfo)
4727 loop = LOOP_VINFO_LOOP (loop_vinfo);
4728 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4729 containing_loop = (gimple_bb (stmt))->loop_father;
4730 pe = loop_preheader_edge (loop);
4732 else
4734 gcc_assert (bb_vinfo);
4735 only_init = true;
4736 *ptr_incr = NULL;
4739 /* Check the step (evolution) of the load in LOOP, and record
4740 whether it's invariant. */
4741 step = vect_dr_behavior (dr)->step;
4742 if (integer_zerop (step))
4743 *inv_p = true;
4744 else
4745 *inv_p = false;
4747 /* Create an expression for the first address accessed by this load
4748 in LOOP. */
4749 base_name = get_name (DR_BASE_ADDRESS (dr));
4751 if (dump_enabled_p ())
4753 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4754 dump_printf_loc (MSG_NOTE, vect_location,
4755 "create %s-pointer variable to type: ",
4756 get_tree_code_name (TREE_CODE (aggr_type)));
4757 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4758 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4759 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4760 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4761 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4762 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4763 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4764 else
4765 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4766 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4767 dump_printf (MSG_NOTE, "\n");
4770 /* (1) Create the new aggregate-pointer variable.
4771 Vector and array types inherit the alias set of their component
4772 type by default so we need to use a ref-all pointer if the data
4773 reference does not conflict with the created aggregated data
4774 reference because it is not addressable. */
4775 bool need_ref_all = false;
4776 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4777 get_alias_set (DR_REF (dr))))
4778 need_ref_all = true;
4779 /* Likewise for any of the data references in the stmt group. */
4780 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4782 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4785 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4786 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4787 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4788 get_alias_set (DR_REF (sdr))))
4790 need_ref_all = true;
4791 break;
4793 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4795 while (orig_stmt);
4797 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4798 need_ref_all);
4799 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4802 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4803 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4804 def-use update cycles for the pointer: one relative to the outer-loop
4805 (LOOP), which is what steps (3) and (4) below do. The other is relative
4806 to the inner-loop (which is the inner-most loop containing the dataref),
4807 and this is done by step (5) below.
4809 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4810 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4811 redundant. Steps (3),(4) create the following:
4813 vp0 = &base_addr;
4814 LOOP: vp1 = phi(vp0,vp2)
4817 vp2 = vp1 + step
4818 goto LOOP
4820 If there is an inner-loop nested in loop, then step (5) will also be
4821 applied, and an additional update in the inner-loop will be created:
4823 vp0 = &base_addr;
4824 LOOP: vp1 = phi(vp0,vp2)
4826 inner: vp3 = phi(vp1,vp4)
4827 vp4 = vp3 + inner_step
4828 if () goto inner
4830 vp2 = vp1 + step
4831 if () goto LOOP */
4833 /* (2) Calculate the initial address of the aggregate-pointer, and set
4834 the aggregate-pointer to point to it before the loop. */
4836 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4838 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4839 offset, byte_offset);
4840 if (new_stmt_list)
4842 if (pe)
4844 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4845 gcc_assert (!new_bb);
4847 else
4848 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4851 *initial_address = new_temp;
4852 aggr_ptr_init = new_temp;
4854 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4855 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4856 inner-loop nested in LOOP (during outer-loop vectorization). */
4858 /* No update in loop is required. */
4859 if (only_init && (!loop_vinfo || at_loop == loop))
4860 aptr = aggr_ptr_init;
4861 else
4863 if (iv_step == NULL_TREE)
4865 /* The step of the aggregate pointer is the type size. */
4866 iv_step = TYPE_SIZE_UNIT (aggr_type);
4867 /* One exception to the above is when the scalar step of the load in
4868 LOOP is zero. In this case the step here is also zero. */
4869 if (*inv_p)
4870 iv_step = size_zero_node;
4871 else if (tree_int_cst_sgn (step) == -1)
4872 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4875 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4877 create_iv (aggr_ptr_init,
4878 fold_convert (aggr_ptr_type, iv_step),
4879 aggr_ptr, loop, &incr_gsi, insert_after,
4880 &indx_before_incr, &indx_after_incr);
4881 incr = gsi_stmt (incr_gsi);
4882 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4884 /* Copy the points-to information if it exists. */
4885 if (DR_PTR_INFO (dr))
4887 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4888 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4890 if (ptr_incr)
4891 *ptr_incr = incr;
4893 aptr = indx_before_incr;
4896 if (!nested_in_vect_loop || only_init)
4897 return aptr;
4900 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4901 nested in LOOP, if it exists. */
4903 gcc_assert (nested_in_vect_loop);
4904 if (!only_init)
4906 standard_iv_increment_position (containing_loop, &incr_gsi,
4907 &insert_after);
4908 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4909 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4910 &indx_after_incr);
4911 incr = gsi_stmt (incr_gsi);
4912 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4914 /* Copy the points-to information if it exists. */
4915 if (DR_PTR_INFO (dr))
4917 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4918 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4920 if (ptr_incr)
4921 *ptr_incr = incr;
4923 return indx_before_incr;
4925 else
4926 gcc_unreachable ();
4930 /* Function bump_vector_ptr
4932 Increment a pointer (to a vector type) by vector-size. If requested,
4933 i.e. if PTR-INCR is given, then also connect the new increment stmt
4934 to the existing def-use update-chain of the pointer, by modifying
4935 the PTR_INCR as illustrated below:
4937 The pointer def-use update-chain before this function:
4938 DATAREF_PTR = phi (p_0, p_2)
4939 ....
4940 PTR_INCR: p_2 = DATAREF_PTR + step
4942 The pointer def-use update-chain after this function:
4943 DATAREF_PTR = phi (p_0, p_2)
4944 ....
4945 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4946 ....
4947 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4949 Input:
4950 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4951 in the loop.
4952 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4953 the loop. The increment amount across iterations is expected
4954 to be vector_size.
4955 BSI - location where the new update stmt is to be placed.
4956 STMT - the original scalar memory-access stmt that is being vectorized.
4957 BUMP - optional. The offset by which to bump the pointer. If not given,
4958 the offset is assumed to be vector_size.
4960 Output: Return NEW_DATAREF_PTR as illustrated above.
4964 tree
4965 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4966 gimple *stmt, tree bump)
4968 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4969 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4970 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4971 tree update = TYPE_SIZE_UNIT (vectype);
4972 gassign *incr_stmt;
4973 ssa_op_iter iter;
4974 use_operand_p use_p;
4975 tree new_dataref_ptr;
4977 if (bump)
4978 update = bump;
4980 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4981 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4982 else
4983 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4984 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4985 dataref_ptr, update);
4986 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4988 /* Copy the points-to information if it exists. */
4989 if (DR_PTR_INFO (dr))
4991 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4992 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4995 if (!ptr_incr)
4996 return new_dataref_ptr;
4998 /* Update the vector-pointer's cross-iteration increment. */
4999 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5001 tree use = USE_FROM_PTR (use_p);
5003 if (use == dataref_ptr)
5004 SET_USE (use_p, new_dataref_ptr);
5005 else
5006 gcc_assert (operand_equal_p (use, update, 0));
5009 return new_dataref_ptr;
5013 /* Function vect_create_destination_var.
5015 Create a new temporary of type VECTYPE. */
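/* For instance, if SCALAR_DEST is an SSA name for a user variable x with
   version 7 and VECTYPE is a non-boolean vector type, the result is a
   vector temporary whose name is based on "vect_x_7"; with a NULL
   VECTYPE a scalar temporary based on "stmp_x_7" is created instead.  */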
5017 tree
5018 vect_create_destination_var (tree scalar_dest, tree vectype)
5020 tree vec_dest;
5021 const char *name;
5022 char *new_name;
5023 tree type;
5024 enum vect_var_kind kind;
5026 kind = vectype
5027 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5028 ? vect_mask_var
5029 : vect_simple_var
5030 : vect_scalar_var;
5031 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5033 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5035 name = get_name (scalar_dest);
5036 if (name)
5037 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5038 else
5039 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5040 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5041 free (new_name);
5043 return vec_dest;
5046 /* Function vect_grouped_store_supported.
5048 Returns TRUE if interleave high and interleave low permutations
5049 are supported, and FALSE otherwise. */
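/* As a concrete illustration for the power-of-two case with 8-element
   vectors: the two interleave selectors that must be supported are
   { 0, 8, 1, 9, 2, 10, 3, 11 } and { 4, 12, 5, 13, 6, 14, 7, 15 },
   matching the permutations that vect_permute_store_chain generates
   below.  */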
5051 bool
5052 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5054 machine_mode mode = TYPE_MODE (vectype);
5056 /* vect_permute_store_chain requires the group size to be equal to 3 or
5057 be a power of two. */
5058 if (count != 3 && exact_log2 (count) == -1)
5060 if (dump_enabled_p ())
5061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5062 "the size of the group of accesses"
5063 " is not a power of 2 or not equal to 3\n");
5064 return false;
5067 /* Check that the permutation is supported. */
5068 if (VECTOR_MODE_P (mode))
5070 unsigned int i;
5071 if (count == 3)
5073 unsigned int j0 = 0, j1 = 0, j2 = 0;
5074 unsigned int i, j;
5076 unsigned int nelt;
5077 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5079 if (dump_enabled_p ())
5080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5081 "cannot handle groups of 3 stores for"
5082 " variable-length vectors\n");
5083 return false;
5086 vec_perm_builder sel (nelt, nelt, 1);
5087 sel.quick_grow (nelt);
5088 vec_perm_indices indices;
5089 for (j = 0; j < 3; j++)
5091 int nelt0 = ((3 - j) * nelt) % 3;
5092 int nelt1 = ((3 - j) * nelt + 1) % 3;
5093 int nelt2 = ((3 - j) * nelt + 2) % 3;
5094 for (i = 0; i < nelt; i++)
5096 if (3 * i + nelt0 < nelt)
5097 sel[3 * i + nelt0] = j0++;
5098 if (3 * i + nelt1 < nelt)
5099 sel[3 * i + nelt1] = nelt + j1++;
5100 if (3 * i + nelt2 < nelt)
5101 sel[3 * i + nelt2] = 0;
5103 indices.new_vector (sel, 2, nelt);
5104 if (!can_vec_perm_const_p (mode, indices))
5106 if (dump_enabled_p ())
5107 dump_printf (MSG_MISSED_OPTIMIZATION,
5108 "permutation op not supported by target.\n");
5109 return false;
5112 for (i = 0; i < nelt; i++)
5114 if (3 * i + nelt0 < nelt)
5115 sel[3 * i + nelt0] = 3 * i + nelt0;
5116 if (3 * i + nelt1 < nelt)
5117 sel[3 * i + nelt1] = 3 * i + nelt1;
5118 if (3 * i + nelt2 < nelt)
5119 sel[3 * i + nelt2] = nelt + j2++;
5121 indices.new_vector (sel, 2, nelt);
5122 if (!can_vec_perm_const_p (mode, indices))
5124 if (dump_enabled_p ())
5125 dump_printf (MSG_MISSED_OPTIMIZATION,
5126 "permutation op not supported by target.\n");
5127 return false;
5130 return true;
5132 else
5134 /* If length is not equal to 3 then only a power of 2 is supported. */
5135 gcc_assert (pow2p_hwi (count));
5136 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5138 /* The encoding has 2 interleaved stepped patterns. */
5139 vec_perm_builder sel (nelt, 2, 3);
5140 sel.quick_grow (6);
5141 for (i = 0; i < 3; i++)
5143 sel[i * 2] = i;
5144 sel[i * 2 + 1] = i + nelt;
5146 vec_perm_indices indices (sel, 2, nelt);
5147 if (can_vec_perm_const_p (mode, indices))
5149 for (i = 0; i < 6; i++)
5150 sel[i] += exact_div (nelt, 2);
5151 indices.new_vector (sel, 2, nelt);
5152 if (can_vec_perm_const_p (mode, indices))
5153 return true;
5158 if (dump_enabled_p ())
5159 dump_printf (MSG_MISSED_OPTIMIZATION,
5160 "permutation op not supported by target.\n");
5161 return false;
5165 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5166 type VECTYPE. MASKED_P says whether the masked form is needed. */
5168 bool
5169 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5170 bool masked_p)
5172 if (masked_p)
5173 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5174 vec_mask_store_lanes_optab,
5175 vectype, count);
5176 else
5177 return vect_lanes_optab_supported_p ("vec_store_lanes",
5178 vec_store_lanes_optab,
5179 vectype, count);
5183 /* Function vect_permute_store_chain.
5185 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5186 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5187 the data correctly for the stores. Return the final references for stores
5188 in RESULT_CHAIN.
5190 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5191 The input is 4 vectors each containing 8 elements. We assign a number to
5192 each element, the input sequence is:
5194 1st vec: 0 1 2 3 4 5 6 7
5195 2nd vec: 8 9 10 11 12 13 14 15
5196 3rd vec: 16 17 18 19 20 21 22 23
5197 4th vec: 24 25 26 27 28 29 30 31
5199 The output sequence should be:
5201 1st vec: 0 8 16 24 1 9 17 25
5202 2nd vec: 2 10 18 26 3 11 19 27
5203 3rd vec: 4 12 20 28 5 13 21 29
5204 4th vec: 6 14 22 30 7 15 23 31
5206 i.e., we interleave the contents of the four vectors in their order.
5208 We use interleave_high/low instructions to create such output. The input of
5209 each interleave_high/low operation is two vectors:
5210 1st vec 2nd vec
5211 0 1 2 3 4 5 6 7
5212 the even elements of the result vector are obtained left-to-right from the
5213 high/low elements of the first vector. The odd elements of the result are
5214 obtained left-to-right from the high/low elements of the second vector.
5215 The output of interleave_high will be: 0 4 1 5
5216 and of interleave_low: 2 6 3 7
5219 The permutation is done in log LENGTH stages. In each stage interleave_high
5220 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5221 where the first argument is taken from the first half of DR_CHAIN and the
5222 second argument from its second half.
5223 In our example,
5225 I1: interleave_high (1st vec, 3rd vec)
5226 I2: interleave_low (1st vec, 3rd vec)
5227 I3: interleave_high (2nd vec, 4th vec)
5228 I4: interleave_low (2nd vec, 4th vec)
5230 The output for the first stage is:
5232 I1: 0 16 1 17 2 18 3 19
5233 I2: 4 20 5 21 6 22 7 23
5234 I3: 8 24 9 25 10 26 11 27
5235 I4: 12 28 13 29 14 30 15 31
5237 The output of the second stage, i.e. the final result is:
5239 I1: 0 8 16 24 1 9 17 25
5240 I2: 2 10 18 26 3 11 19 27
5241 I3: 4 12 20 28 5 13 21 29
5242 I4: 6 14 22 30 7 15 23 31. */
5244 void
5245 vect_permute_store_chain (vec<tree> dr_chain,
5246 unsigned int length,
5247 gimple *stmt,
5248 gimple_stmt_iterator *gsi,
5249 vec<tree> *result_chain)
5251 tree vect1, vect2, high, low;
5252 gimple *perm_stmt;
5253 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5254 tree perm_mask_low, perm_mask_high;
5255 tree data_ref;
5256 tree perm3_mask_low, perm3_mask_high;
5257 unsigned int i, j, n, log_length = exact_log2 (length);
5259 result_chain->quick_grow (length);
5260 memcpy (result_chain->address (), dr_chain.address (),
5261 length * sizeof (tree));
5263 if (length == 3)
5265 /* vect_grouped_store_supported ensures that this is constant. */
5266 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5267 unsigned int j0 = 0, j1 = 0, j2 = 0;
5269 vec_perm_builder sel (nelt, nelt, 1);
5270 sel.quick_grow (nelt);
5271 vec_perm_indices indices;
5272 for (j = 0; j < 3; j++)
5274 int nelt0 = ((3 - j) * nelt) % 3;
5275 int nelt1 = ((3 - j) * nelt + 1) % 3;
5276 int nelt2 = ((3 - j) * nelt + 2) % 3;
5278 for (i = 0; i < nelt; i++)
5280 if (3 * i + nelt0 < nelt)
5281 sel[3 * i + nelt0] = j0++;
5282 if (3 * i + nelt1 < nelt)
5283 sel[3 * i + nelt1] = nelt + j1++;
5284 if (3 * i + nelt2 < nelt)
5285 sel[3 * i + nelt2] = 0;
5287 indices.new_vector (sel, 2, nelt);
5288 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5290 for (i = 0; i < nelt; i++)
5292 if (3 * i + nelt0 < nelt)
5293 sel[3 * i + nelt0] = 3 * i + nelt0;
5294 if (3 * i + nelt1 < nelt)
5295 sel[3 * i + nelt1] = 3 * i + nelt1;
5296 if (3 * i + nelt2 < nelt)
5297 sel[3 * i + nelt2] = nelt + j2++;
5299 indices.new_vector (sel, 2, nelt);
5300 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5302 vect1 = dr_chain[0];
5303 vect2 = dr_chain[1];
5305 /* Create interleaving stmt:
5306 low = VEC_PERM_EXPR <vect1, vect2,
5307 {j, nelt, *, j + 1, nelt + j + 1, *,
5308 j + 2, nelt + j + 2, *, ...}> */
5309 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5310 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5311 vect2, perm3_mask_low);
5312 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5314 vect1 = data_ref;
5315 vect2 = dr_chain[2];
5316 /* Create interleaving stmt:
5317 low = VEC_PERM_EXPR <vect1, vect2,
5318 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5319 6, 7, nelt + j + 2, ...}> */
5320 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5321 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5322 vect2, perm3_mask_high);
5323 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5324 (*result_chain)[j] = data_ref;
5327 else
5329 /* If length is not equal to 3 then only a power of 2 is supported. */
5330 gcc_assert (pow2p_hwi (length));
5332 /* The encoding has 2 interleaved stepped patterns. */
5333 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5334 vec_perm_builder sel (nelt, 2, 3);
5335 sel.quick_grow (6);
5336 for (i = 0; i < 3; i++)
5338 sel[i * 2] = i;
5339 sel[i * 2 + 1] = i + nelt;
5341 vec_perm_indices indices (sel, 2, nelt);
5342 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5344 for (i = 0; i < 6; i++)
5345 sel[i] += exact_div (nelt, 2);
5346 indices.new_vector (sel, 2, nelt);
5347 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5349 for (i = 0, n = log_length; i < n; i++)
5351 for (j = 0; j < length/2; j++)
5353 vect1 = dr_chain[j];
5354 vect2 = dr_chain[j+length/2];
5356 /* Create interleaving stmt:
5357 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5358 ...}> */
5359 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5360 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5361 vect2, perm_mask_high);
5362 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5363 (*result_chain)[2*j] = high;
5365 /* Create interleaving stmt:
5366 low = VEC_PERM_EXPR <vect1, vect2,
5367 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5368 ...}> */
5369 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5370 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5371 vect2, perm_mask_low);
5372 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5373 (*result_chain)[2*j+1] = low;
5375 memcpy (dr_chain.address (), result_chain->address (),
5376 length * sizeof (tree));
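/* Illustrative standalone sketch (hypothetical helpers, not GCC internals):
   a plain-C model, on int arrays, of the power-of-2 interleaving performed
   above.  With 4 input "vectors" of 8 elements numbered 0..31, two stages
   reproduce the I1..I4 results shown in the function comment.  */

static void
model_interleave_high (const int *a, const int *b, int nelt, int *out)
{
  /* Models VEC_PERM_EXPR <a, b, {0, nelt, 1, nelt+1, ...}>: pair up the
     low halves of A and B.  */
  for (int i = 0; i < nelt / 2; i++)
    {
      out[2 * i] = a[i];
      out[2 * i + 1] = b[i];
    }
}

static void
model_interleave_low (const int *a, const int *b, int nelt, int *out)
{
  /* Models VEC_PERM_EXPR <a, b, {nelt/2, nelt*3/2, nelt/2+1, ...}>: pair up
     the high halves of A and B.  */
  for (int i = 0; i < nelt / 2; i++)
    {
      out[2 * i] = a[nelt / 2 + i];
      out[2 * i + 1] = b[nelt / 2 + i];
    }
}

static void
model_permute_store_chain (int chain[4][8])
{
  /* log2 (4) == 2 stages; at each stage vector J is interleaved with vector
     J + LENGTH/2 and the high/low results land in slots 2*J and 2*J + 1,
     mirroring the result_chain/dr_chain copy-back above.  */
  int next[4][8];
  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < 2; j++)
        {
          model_interleave_high (chain[j], chain[j + 2], 8, next[2 * j]);
          model_interleave_low (chain[j], chain[j + 2], 8, next[2 * j + 1]);
        }
      for (int j = 0; j < 4; j++)
        for (int i = 0; i < 8; i++)
          chain[j][i] = next[j][i];
    }
}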
5381 /* Function vect_setup_realignment
5383 This function is called when vectorizing an unaligned load using
5384 the dr_explicit_realign[_optimized] scheme.
5385 This function generates the following code at the loop prolog:
5387 p = initial_addr;
5388 x msq_init = *(floor(p)); # prolog load
5389 realignment_token = call target_builtin;
5390 loop:
5391 x msq = phi (msq_init, ---)
5393 The stmts marked with x are generated only for the case of
5394 dr_explicit_realign_optimized.
5396 The code above sets up a new (vector) pointer, pointing to the first
5397 location accessed by STMT, and a "floor-aligned" load using that pointer.
5398 It also generates code to compute the "realignment-token" (if the relevant
5399 target hook was defined), and creates a phi-node at the loop-header bb
5400 whose arguments are the result of the prolog-load (created by this
5401 function) and the result of a load that takes place in the loop (to be
5402 created by the caller to this function).
5404 For the case of dr_explicit_realign_optimized:
5405 The caller to this function uses the phi-result (msq) to create the
5406 realignment code inside the loop, and sets up the missing phi argument,
5407 as follows:
5408 loop:
5409 msq = phi (msq_init, lsq)
5410 lsq = *(floor(p')); # load in loop
5411 result = realign_load (msq, lsq, realignment_token);
5413 For the case of dr_explicit_realign:
5414 loop:
5415 msq = *(floor(p)); # load in loop
5416 p' = p + (VS-1);
5417 lsq = *(floor(p')); # load in loop
5418 result = realign_load (msq, lsq, realignment_token);
5420 Input:
5421 STMT - (scalar) load stmt to be vectorized. This load accesses
5422 a memory location that may be unaligned.
5423 GSI - place where new code is to be inserted.
5424 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5425 is used.
5427 Output:
5428 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5429 target hook, if defined.
5430 Return value - the result of the loop-header phi node. */
5432 tree
5433 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5434 tree *realignment_token,
5435 enum dr_alignment_support alignment_support_scheme,
5436 tree init_addr,
5437 struct loop **at_loop)
5439 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5440 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5441 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5442 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5443 struct loop *loop = NULL;
5444 edge pe = NULL;
5445 tree scalar_dest = gimple_assign_lhs (stmt);
5446 tree vec_dest;
5447 gimple *inc;
5448 tree ptr;
5449 tree data_ref;
5450 basic_block new_bb;
5451 tree msq_init = NULL_TREE;
5452 tree new_temp;
5453 gphi *phi_stmt;
5454 tree msq = NULL_TREE;
5455 gimple_seq stmts = NULL;
5456 bool inv_p;
5457 bool compute_in_loop = false;
5458 bool nested_in_vect_loop = false;
5459 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5460 struct loop *loop_for_initial_load = NULL;
5462 if (loop_vinfo)
5464 loop = LOOP_VINFO_LOOP (loop_vinfo);
5465 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5468 gcc_assert (alignment_support_scheme == dr_explicit_realign
5469 || alignment_support_scheme == dr_explicit_realign_optimized);
5471 /* We need to generate three things:
5472 1. the misalignment computation
5473 2. the extra vector load (for the optimized realignment scheme).
5474 3. the phi node for the two vectors from which the realignment is
5475 done (for the optimized realignment scheme). */
5477 /* 1. Determine where to generate the misalignment computation.
5479 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5480 calculation will be generated by this function, outside the loop (in the
5481 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5482 caller, inside the loop.
5484 Background: If the misalignment remains fixed throughout the iterations of
5485 the loop, then both realignment schemes are applicable, and also the
5486 misalignment computation can be done outside LOOP. This is because we are
5487 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5488 are a multiple of VS (the Vector Size), and therefore the misalignment in
5489 different vectorized LOOP iterations is always the same.
5490 The problem arises only if the memory access is in an inner-loop nested
5491 inside LOOP, which is now being vectorized using outer-loop vectorization.
5492 This is the only case when the misalignment of the memory access may not
5493 remain fixed throughout the iterations of the inner-loop (as explained in
5494 detail in vect_supportable_dr_alignment). In this case, not only is the
5495 optimized realignment scheme not applicable, but also the misalignment
5496 computation (and generation of the realignment token that is passed to
5497 REALIGN_LOAD) have to be done inside the loop.
5499 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5500 or not, which in turn determines if the misalignment is computed inside
5501 the inner-loop, or outside LOOP. */
5503 if (init_addr != NULL_TREE || !loop_vinfo)
5505 compute_in_loop = true;
5506 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5510 /* 2. Determine where to generate the extra vector load.
5512 For the optimized realignment scheme, instead of generating two vector
5513 loads in each iteration, we generate a single extra vector load in the
5514 preheader of the loop, and in each iteration reuse the result of the
5515 vector load from the previous iteration. In case the memory access is in
5516 an inner-loop nested inside LOOP, which is now being vectorized using
5517 outer-loop vectorization, we need to determine whether this initial vector
5518 load should be generated at the preheader of the inner-loop, or can be
5519 generated at the preheader of LOOP. If the memory access has no evolution
5520 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5521 to be generated inside LOOP (in the preheader of the inner-loop). */
5523 if (nested_in_vect_loop)
5525 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5526 bool invariant_in_outerloop =
5527 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5528 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5530 else
5531 loop_for_initial_load = loop;
5532 if (at_loop)
5533 *at_loop = loop_for_initial_load;
5535 if (loop_for_initial_load)
5536 pe = loop_preheader_edge (loop_for_initial_load);
5538 /* 3. For the case of the optimized realignment, create the first vector
5539 load at the loop preheader. */
5541 if (alignment_support_scheme == dr_explicit_realign_optimized)
5543 /* Create msq_init = *(floor(p1)) in the loop preheader */
5544 gassign *new_stmt;
5546 gcc_assert (!compute_in_loop);
5547 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5548 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5549 NULL_TREE, &init_addr, NULL, &inc,
5550 true, &inv_p);
5551 if (TREE_CODE (ptr) == SSA_NAME)
5552 new_temp = copy_ssa_name (ptr);
5553 else
5554 new_temp = make_ssa_name (TREE_TYPE (ptr));
5555 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5556 new_stmt = gimple_build_assign
5557 (new_temp, BIT_AND_EXPR, ptr,
5558 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5559 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5560 gcc_assert (!new_bb);
5561 data_ref
5562 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5563 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5564 new_stmt = gimple_build_assign (vec_dest, data_ref);
5565 new_temp = make_ssa_name (vec_dest, new_stmt);
5566 gimple_assign_set_lhs (new_stmt, new_temp);
5567 if (pe)
5569 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5570 gcc_assert (!new_bb);
5572 else
5573 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5575 msq_init = gimple_assign_lhs (new_stmt);
5578 /* 4. Create realignment token using a target builtin, if available.
5579 It is done either inside the containing loop, or before LOOP (as
5580 determined above). */
5582 if (targetm.vectorize.builtin_mask_for_load)
5584 gcall *new_stmt;
5585 tree builtin_decl;
5587 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5588 if (!init_addr)
5590 /* Generate the INIT_ADDR computation outside LOOP. */
5591 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5592 NULL_TREE);
5593 if (loop)
5595 pe = loop_preheader_edge (loop);
5596 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5597 gcc_assert (!new_bb);
5599 else
5600 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5603 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5604 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5605 vec_dest =
5606 vect_create_destination_var (scalar_dest,
5607 gimple_call_return_type (new_stmt));
5608 new_temp = make_ssa_name (vec_dest, new_stmt);
5609 gimple_call_set_lhs (new_stmt, new_temp);
5611 if (compute_in_loop)
5612 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5613 else
5615 /* Generate the misalignment computation outside LOOP. */
5616 pe = loop_preheader_edge (loop);
5617 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5618 gcc_assert (!new_bb);
5621 *realignment_token = gimple_call_lhs (new_stmt);
5623 /* The result of the CALL_EXPR to this builtin is determined from
5624 the value of the parameter and no global variables are touched
5625 which makes the builtin a "const" function. Requiring the
5626 builtin to have the "const" attribute makes it unnecessary
5627 to call mark_call_clobbered. */
5628 gcc_assert (TREE_READONLY (builtin_decl));
5631 if (alignment_support_scheme == dr_explicit_realign)
5632 return msq;
5634 gcc_assert (!compute_in_loop);
5635 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5638 /* 5. Create msq = phi <msq_init, lsq> in loop */
5640 pe = loop_preheader_edge (containing_loop);
5641 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5642 msq = make_ssa_name (vec_dest);
5643 phi_stmt = create_phi_node (msq, containing_loop->header);
5644 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5646 return msq;
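/* Illustrative standalone sketch (hypothetical helper, not GCC internals):
   a rough plain-C model of what the realignment scheme set up above computes
   at run time.  MSQ models the "floor"-aligned prolog/in-loop load, LSQ the
   next aligned load, and OFS the misalignment in elements; REALIGN_LOAD is
   modelled here, under those assumptions, as selecting a window of NELT
   elements from the concatenation of the two aligned loads.  */

static void
model_realign_load (const int *msq, const int *lsq, int ofs, int nelt,
                    int *result)
{
  for (int i = 0; i < nelt; i++)
    result[i] = (ofs + i < nelt) ? msq[ofs + i] : lsq[ofs + i - nelt];
}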
5650 /* Function vect_grouped_load_supported.
5652 COUNT is the size of the load group (the number of statements plus the
5653 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5654 only one statement, with a gap of COUNT - 1.
5656 Returns true if a suitable permute exists. */
5658 bool
5659 vect_grouped_load_supported (tree vectype, bool single_element_p,
5660 unsigned HOST_WIDE_INT count)
5662 machine_mode mode = TYPE_MODE (vectype);
5664 /* If this is single-element interleaving with an element distance
5665 that leaves unused vector loads around, punt - we would at least create
5666 very sub-optimal code in that case (and blow up memory,
5667 see PR65518). */
5668 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5670 if (dump_enabled_p ())
5671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5672 "single-element interleaving not supported "
5673 "for not adjacent vector loads\n");
5674 return false;
5677 /* vect_permute_load_chain requires the group size to be equal to 3 or
5678 be a power of two. */
5679 if (count != 3 && exact_log2 (count) == -1)
5681 if (dump_enabled_p ())
5682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5683 "the size of the group of accesses"
5684 " is not a power of 2 or not equal to 3\n");
5685 return false;
5688 /* Check that the permutation is supported. */
5689 if (VECTOR_MODE_P (mode))
5691 unsigned int i, j;
5692 if (count == 3)
5694 unsigned int nelt;
5695 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5697 if (dump_enabled_p ())
5698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5699 "cannot handle groups of 3 loads for"
5700 " variable-length vectors\n");
5701 return false;
5704 vec_perm_builder sel (nelt, nelt, 1);
5705 sel.quick_grow (nelt);
5706 vec_perm_indices indices;
5707 unsigned int k;
5708 for (k = 0; k < 3; k++)
5710 for (i = 0; i < nelt; i++)
5711 if (3 * i + k < 2 * nelt)
5712 sel[i] = 3 * i + k;
5713 else
5714 sel[i] = 0;
5715 indices.new_vector (sel, 2, nelt);
5716 if (!can_vec_perm_const_p (mode, indices))
5718 if (dump_enabled_p ())
5719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5720 "shuffle of 3 loads is not supported by"
5721 " target\n");
5722 return false;
5724 for (i = 0, j = 0; i < nelt; i++)
5725 if (3 * i + k < 2 * nelt)
5726 sel[i] = i;
5727 else
5728 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5729 indices.new_vector (sel, 2, nelt);
5730 if (!can_vec_perm_const_p (mode, indices))
5732 if (dump_enabled_p ())
5733 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5734 "shuffle of 3 loads is not supported by"
5735 " target\n");
5736 return false;
5739 return true;
5741 else
5743 /* If length is not equal to 3 then only a power of 2 is supported. */
5744 gcc_assert (pow2p_hwi (count));
5745 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5747 /* The encoding has a single stepped pattern. */
5748 vec_perm_builder sel (nelt, 1, 3);
5749 sel.quick_grow (3);
5750 for (i = 0; i < 3; i++)
5751 sel[i] = i * 2;
5752 vec_perm_indices indices (sel, 2, nelt);
5753 if (can_vec_perm_const_p (mode, indices))
5755 for (i = 0; i < 3; i++)
5756 sel[i] = i * 2 + 1;
5757 indices.new_vector (sel, 2, nelt);
5758 if (can_vec_perm_const_p (mode, indices))
5759 return true;
5764 if (dump_enabled_p ())
5765 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5766 "extract even/odd not supported by target\n");
5767 return false;
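/* Illustrative standalone sketch (hypothetical helper, not GCC internals):
   the generic two-input constant permutation whose availability the checks
   above query.  Element SEL[i] of the result comes from the concatenation of
   A and B, so for NELT == 8 "extract even" uses {0, 2, 4, 6, 8, 10, 12, 14}
   and "extract odd" uses {1, 3, 5, 7, 9, 11, 13, 15}.  */

static void
model_vec_perm (const int *a, const int *b, const int *sel, int nelt,
                int *out)
{
  for (int i = 0; i < nelt; i++)
    out[i] = sel[i] < nelt ? a[sel[i]] : b[sel[i] - nelt];
}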
5770 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5771 type VECTYPE. MASKED_P says whether the masked form is needed. */
5773 bool
5774 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5775 bool masked_p)
5777 if (masked_p)
5778 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5779 vec_mask_load_lanes_optab,
5780 vectype, count);
5781 else
5782 return vect_lanes_optab_supported_p ("vec_load_lanes",
5783 vec_load_lanes_optab,
5784 vectype, count);
5787 /* Function vect_permute_load_chain.
5789 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5790 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5791 the input data correctly. Return the final references for loads in
5792 RESULT_CHAIN.
5794 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5795 The input is 4 vectors each containing 8 elements. We assign a number to each
5796 element; the input sequence is:
5798 1st vec: 0 1 2 3 4 5 6 7
5799 2nd vec: 8 9 10 11 12 13 14 15
5800 3rd vec: 16 17 18 19 20 21 22 23
5801 4th vec: 24 25 26 27 28 29 30 31
5803 The output sequence should be:
5805 1st vec: 0 4 8 12 16 20 24 28
5806 2nd vec: 1 5 9 13 17 21 25 29
5807 3rd vec: 2 6 10 14 18 22 26 30
5808 4th vec: 3 7 11 15 19 23 27 31
5810 i.e., the first output vector should contain the first elements of each
5811 interleaving group, etc.
5813 We use extract_even/odd instructions to create such output. The input of
5814 each extract_even/odd operation is two vectors
5815 1st vec 2nd vec
5816 0 1 2 3 4 5 6 7
5818 and the output is the vector of extracted even/odd elements. The output of
5819 extract_even will be: 0 2 4 6
5820 and of extract_odd: 1 3 5 7
5823 The permutation is done in log LENGTH stages. In each stage extract_even
5824 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5825 their order. In our example,
5827 E1: extract_even (1st vec, 2nd vec)
5828 E2: extract_odd (1st vec, 2nd vec)
5829 E3: extract_even (3rd vec, 4th vec)
5830 E4: extract_odd (3rd vec, 4th vec)
5832 The output for the first stage will be:
5834 E1: 0 2 4 6 8 10 12 14
5835 E2: 1 3 5 7 9 11 13 15
5836 E3: 16 18 20 22 24 26 28 30
5837 E4: 17 19 21 23 25 27 29 31
5839 In order to proceed and create the correct sequence for the next stage (or
5840 for the correct output, if the second stage is the last one, as in our
5841 example), we first put the output of extract_even operation and then the
5842 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5843 The input for the second stage is:
5845 1st vec (E1): 0 2 4 6 8 10 12 14
5846 2nd vec (E3): 16 18 20 22 24 26 28 30
5847 3rd vec (E2): 1 3 5 7 9 11 13 15
5848 4th vec (E4): 17 19 21 23 25 27 29 31
5850 The output of the second stage:
5852 E1: 0 4 8 12 16 20 24 28
5853 E2: 2 6 10 14 18 22 26 30
5854 E3: 1 5 9 13 17 21 25 29
5855 E4: 3 7 11 15 19 23 27 31
5857 And RESULT_CHAIN after reordering:
5859 1st vec (E1): 0 4 8 12 16 20 24 28
5860 2nd vec (E3): 1 5 9 13 17 21 25 29
5861 3rd vec (E2): 2 6 10 14 18 22 26 30
5862 4th vec (E4): 3 7 11 15 19 23 27 31. */
5864 static void
5865 vect_permute_load_chain (vec<tree> dr_chain,
5866 unsigned int length,
5867 gimple *stmt,
5868 gimple_stmt_iterator *gsi,
5869 vec<tree> *result_chain)
5871 tree data_ref, first_vect, second_vect;
5872 tree perm_mask_even, perm_mask_odd;
5873 tree perm3_mask_low, perm3_mask_high;
5874 gimple *perm_stmt;
5875 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5876 unsigned int i, j, log_length = exact_log2 (length);
5878 result_chain->quick_grow (length);
5879 memcpy (result_chain->address (), dr_chain.address (),
5880 length * sizeof (tree));
5882 if (length == 3)
5884 /* vect_grouped_load_supported ensures that this is constant. */
5885 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5886 unsigned int k;
5888 vec_perm_builder sel (nelt, nelt, 1);
5889 sel.quick_grow (nelt);
5890 vec_perm_indices indices;
5891 for (k = 0; k < 3; k++)
5893 for (i = 0; i < nelt; i++)
5894 if (3 * i + k < 2 * nelt)
5895 sel[i] = 3 * i + k;
5896 else
5897 sel[i] = 0;
5898 indices.new_vector (sel, 2, nelt);
5899 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5901 for (i = 0, j = 0; i < nelt; i++)
5902 if (3 * i + k < 2 * nelt)
5903 sel[i] = i;
5904 else
5905 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5906 indices.new_vector (sel, 2, nelt);
5907 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5909 first_vect = dr_chain[0];
5910 second_vect = dr_chain[1];
5912 /* Create interleaving stmt (low part of):
5913 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5914 ...}> */
5915 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5916 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5917 second_vect, perm3_mask_low);
5918 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5920 /* Create interleaving stmt (high part of):
5921 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5922 ...}> */
5923 first_vect = data_ref;
5924 second_vect = dr_chain[2];
5925 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5926 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5927 second_vect, perm3_mask_high);
5928 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5929 (*result_chain)[k] = data_ref;
5932 else
5934 /* If length is not equal to 3 then only a power of 2 is supported. */
5935 gcc_assert (pow2p_hwi (length));
5937 /* The encoding has a single stepped pattern. */
5938 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5939 vec_perm_builder sel (nelt, 1, 3);
5940 sel.quick_grow (3);
5941 for (i = 0; i < 3; ++i)
5942 sel[i] = i * 2;
5943 vec_perm_indices indices (sel, 2, nelt);
5944 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5946 for (i = 0; i < 3; ++i)
5947 sel[i] = i * 2 + 1;
5948 indices.new_vector (sel, 2, nelt);
5949 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5951 for (i = 0; i < log_length; i++)
5953 for (j = 0; j < length; j += 2)
5955 first_vect = dr_chain[j];
5956 second_vect = dr_chain[j+1];
5958 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5959 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5960 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5961 first_vect, second_vect,
5962 perm_mask_even);
5963 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5964 (*result_chain)[j/2] = data_ref;
5966 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5967 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5968 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5969 first_vect, second_vect,
5970 perm_mask_odd);
5971 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5972 (*result_chain)[j/2+length/2] = data_ref;
5974 memcpy (dr_chain.address (), result_chain->address (),
5975 length * sizeof (tree));
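/* Illustrative standalone sketch (hypothetical helper, not GCC internals):
   a plain-C model, on int arrays, of the power-of-2 branch above for
   LENGTH == 4 and NELT == 8.  Each stage extracts the even and odd elements
   of the concatenation of a pair of vectors; two stages turn the interleaved
   input 0..31 into the per-group sequences shown in the function comment.  */

static void
model_permute_load_chain (int chain[4][8])
{
  int next[4][8];
  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < 4; j += 2)
        for (int i = 0; i < 8; i++)
          {
            /* Even elements of concat (chain[j], chain[j+1]).  */
            next[j / 2][i] = (i < 4
                              ? chain[j][2 * i]
                              : chain[j + 1][2 * i - 8]);
            /* Odd elements of the same concatenation.  */
            next[j / 2 + 2][i] = (i < 4
                                  ? chain[j][2 * i + 1]
                                  : chain[j + 1][2 * i - 7]);
          }
      for (int j = 0; j < 4; j++)
        for (int i = 0; i < 8; i++)
          chain[j][i] = next[j][i];
    }
}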
5980 /* Function vect_shift_permute_load_chain.
5982 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5983 sequence of stmts to reorder the input data accordingly.
5984 Return the final references for loads in RESULT_CHAIN.
5985 Return true if successful, false otherwise.
5987 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5988 The input is 3 vectors each containing 8 elements. We assign a
5989 number to each element; the input sequence is:
5991 1st vec: 0 1 2 3 4 5 6 7
5992 2nd vec: 8 9 10 11 12 13 14 15
5993 3rd vec: 16 17 18 19 20 21 22 23
5995 The output sequence should be:
5997 1st vec: 0 3 6 9 12 15 18 21
5998 2nd vec: 1 4 7 10 13 16 19 22
5999 3rd vec: 2 5 8 11 14 17 20 23
6001 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6003 First we shuffle all 3 vectors to get the correct element order:
6005 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6006 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6007 3rd vec: (16 19 22) (17 20 23) (18 21)
6009 Next we unite and shift vector 3 times:
6011 1st step:
6012 shift right by 6 the concatenation of:
6013 "1st vec" and "2nd vec"
6014 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6015 "2nd vec" and "3rd vec"
6016 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6017 "3rd vec" and "1st vec"
6018 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6019 | New vectors |
6021 So that now new vectors are:
6023 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6024 2nd vec: (10 13) (16 19 22) (17 20 23)
6025 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6027 2nd step:
6028 shift right by 5 the concatenation of:
6029 "1st vec" and "3rd vec"
6030 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6031 "2nd vec" and "1st vec"
6032 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6033 "3rd vec" and "2nd vec"
6034 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6035 | New vectors |
6037 So that now new vectors are:
6039 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6040 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6041 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6043 3rd step:
6044 shift right by 5 the concatenation of:
6045 "1st vec" and "1st vec"
6046 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6047 shift right by 3 the concatenation of:
6048 "2nd vec" and "2nd vec"
6049 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6050 | New vectors |
6052 So that now all vectors are READY:
6053 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6054 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6055 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6057 This algorithm is faster than one in vect_permute_load_chain if:
6058 1. "shift of a concatination" is faster than general permutation.
6059 This is usually so.
6060 2. The TARGET machine can't execute vector instructions in parallel.
6061 This is because each step of the algorithm depends on the previous one.
6062 The algorithm in vect_permute_load_chain is much more parallel.
6064 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6067 static bool
6068 vect_shift_permute_load_chain (vec<tree> dr_chain,
6069 unsigned int length,
6070 gimple *stmt,
6071 gimple_stmt_iterator *gsi,
6072 vec<tree> *result_chain)
6074 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6075 tree perm2_mask1, perm2_mask2, perm3_mask;
6076 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6077 gimple *perm_stmt;
6079 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6080 unsigned int i;
6081 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6082 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6084 unsigned HOST_WIDE_INT nelt, vf;
6085 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6086 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6087 /* Not supported for variable-length vectors. */
6088 return false;
6090 vec_perm_builder sel (nelt, nelt, 1);
6091 sel.quick_grow (nelt);
6093 result_chain->quick_grow (length);
6094 memcpy (result_chain->address (), dr_chain.address (),
6095 length * sizeof (tree));
6097 if (pow2p_hwi (length) && vf > 4)
6099 unsigned int j, log_length = exact_log2 (length);
6100 for (i = 0; i < nelt / 2; ++i)
6101 sel[i] = i * 2;
6102 for (i = 0; i < nelt / 2; ++i)
6103 sel[nelt / 2 + i] = i * 2 + 1;
6104 vec_perm_indices indices (sel, 2, nelt);
6105 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6107 if (dump_enabled_p ())
6108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6109 "shuffle of 2 fields structure is not \
6110 supported by target\n");
6111 return false;
6113 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6115 for (i = 0; i < nelt / 2; ++i)
6116 sel[i] = i * 2 + 1;
6117 for (i = 0; i < nelt / 2; ++i)
6118 sel[nelt / 2 + i] = i * 2;
6119 indices.new_vector (sel, 2, nelt);
6120 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6122 if (dump_enabled_p ())
6123 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6124 "shuffle of 2 fields structure is not \
6125 supported by target\n");
6126 return false;
6128 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6130 /* Generating permutation constant to shift all elements.
6131 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6132 for (i = 0; i < nelt; i++)
6133 sel[i] = nelt / 2 + i;
6134 indices.new_vector (sel, 2, nelt);
6135 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6137 if (dump_enabled_p ())
6138 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6139 "shift permutation is not supported by target\n");
6140 return false;
6142 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6144 /* Generating permutation constant to select vector from 2.
6145 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6146 for (i = 0; i < nelt / 2; i++)
6147 sel[i] = i;
6148 for (i = nelt / 2; i < nelt; i++)
6149 sel[i] = nelt + i;
6150 indices.new_vector (sel, 2, nelt);
6151 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6153 if (dump_enabled_p ())
6154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6155 "select is not supported by target\n");
6156 return false;
6158 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6160 for (i = 0; i < log_length; i++)
6162 for (j = 0; j < length; j += 2)
6164 first_vect = dr_chain[j];
6165 second_vect = dr_chain[j + 1];
6167 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6168 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6169 first_vect, first_vect,
6170 perm2_mask1);
6171 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6172 vect[0] = data_ref;
6174 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6175 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6176 second_vect, second_vect,
6177 perm2_mask2);
6178 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6179 vect[1] = data_ref;
6181 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6182 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6183 vect[0], vect[1], shift1_mask);
6184 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6185 (*result_chain)[j/2 + length/2] = data_ref;
6187 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6188 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6189 vect[0], vect[1], select_mask);
6190 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6191 (*result_chain)[j/2] = data_ref;
6193 memcpy (dr_chain.address (), result_chain->address (),
6194 length * sizeof (tree));
6196 return true;
6198 if (length == 3 && vf > 2)
6200 unsigned int k = 0, l = 0;
6202 /* Generating permutation constant to get all elements in right order.
6203 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6204 for (i = 0; i < nelt; i++)
6206 if (3 * k + (l % 3) >= nelt)
6208 k = 0;
6209 l += (3 - (nelt % 3));
6211 sel[i] = 3 * k + (l % 3);
6212 k++;
6214 vec_perm_indices indices (sel, 2, nelt);
6215 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6217 if (dump_enabled_p ())
6218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6219 "shuffle of 3 fields structure is not \
6220 supported by target\n");
6221 return false;
6223 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6225 /* Generating permutation constant to shift all elements.
6226 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6227 for (i = 0; i < nelt; i++)
6228 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6229 indices.new_vector (sel, 2, nelt);
6230 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6232 if (dump_enabled_p ())
6233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6234 "shift permutation is not supported by target\n");
6235 return false;
6237 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6239 /* Generating permutation constant to shift all elements.
6240 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6241 for (i = 0; i < nelt; i++)
6242 sel[i] = 2 * (nelt / 3) + 1 + i;
6243 indices.new_vector (sel, 2, nelt);
6244 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6246 if (dump_enabled_p ())
6247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6248 "shift permutation is not supported by target\n");
6249 return false;
6251 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6253 /* Generating permutation constant to shift all elements.
6254 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6255 for (i = 0; i < nelt; i++)
6256 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6257 indices.new_vector (sel, 2, nelt);
6258 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6260 if (dump_enabled_p ())
6261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6262 "shift permutation is not supported by target\n");
6263 return false;
6265 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6267 /* Generating permutation constant to shift all elements.
6268 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6269 for (i = 0; i < nelt; i++)
6270 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6271 indices.new_vector (sel, 2, nelt);
6272 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6274 if (dump_enabled_p ())
6275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6276 "shift permutation is not supported by target\n");
6277 return false;
6279 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6281 for (k = 0; k < 3; k++)
6283 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6284 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6285 dr_chain[k], dr_chain[k],
6286 perm3_mask);
6287 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6288 vect[k] = data_ref;
6291 for (k = 0; k < 3; k++)
6293 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6294 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6295 vect[k % 3], vect[(k + 1) % 3],
6296 shift1_mask);
6297 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6298 vect_shift[k] = data_ref;
6301 for (k = 0; k < 3; k++)
6303 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6304 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6305 vect_shift[(4 - k) % 3],
6306 vect_shift[(3 - k) % 3],
6307 shift2_mask);
6308 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6309 vect[k] = data_ref;
6312 (*result_chain)[3 - (nelt % 3)] = vect[2];
6314 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6315 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6316 vect[0], shift3_mask);
6317 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6318 (*result_chain)[nelt % 3] = data_ref;
6320 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6321 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6322 vect[1], shift4_mask);
6323 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6324 (*result_chain)[0] = data_ref;
6325 return true;
6327 return false;
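/* Illustrative standalone sketch (hypothetical helper, not GCC internals):
   the "shift of a concatenation" building block used repeatedly above,
   modelled on int arrays.  Shifting the concatenation of A and B right by
   AMOUNT keeps the last NELT - AMOUNT elements of A followed by the first
   AMOUNT elements of B, i.e. the VEC_PERM_EXPR with selector
   {AMOUNT, AMOUNT + 1, ...}.  For the first step of the LENGTH == 3 example,
   shifting "1st vec" and "2nd vec" right by 6 yields
   ( 2 5) ( 8 11 14) ( 9 12 15).  */

static void
model_shift_concat (const int *a, const int *b, int amount, int nelt,
                    int *out)
{
  for (int i = 0; i < nelt; i++)
    out[i] = (amount + i < nelt) ? a[amount + i] : b[amount + i - nelt];
}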
6330 /* Function vect_transform_grouped_load.
6332 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6333 to perform their permutation and ascribe the result vectorized statements to
6334 the scalar statements.
6337 void
6338 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6339 gimple_stmt_iterator *gsi)
6341 machine_mode mode;
6342 vec<tree> result_chain = vNULL;
6344 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6345 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6346 vectors, that are ready for vector computation. */
6347 result_chain.create (size);
6349 /* If the reassociation width for the vector type is 2 or greater, the target
6350 machine can execute 2 or more vector instructions in parallel. Otherwise try
6351 to get a chain for the load group using vect_shift_permute_load_chain. */
6352 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6353 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6354 || pow2p_hwi (size)
6355 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6356 gsi, &result_chain))
6357 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6358 vect_record_grouped_load_vectors (stmt, result_chain);
6359 result_chain.release ();
6362 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6363 generated as part of the vectorization of STMT. Assign the statement
6364 for each vector to the associated scalar statement. */
6366 void
6367 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6369 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6370 gimple *next_stmt, *new_stmt;
6371 unsigned int i, gap_count;
6372 tree tmp_data_ref;
6374 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6375 Since we scan the chain starting from its first node, their order
6376 corresponds to the order of data-refs in RESULT_CHAIN. */
6377 next_stmt = first_stmt;
6378 gap_count = 1;
6379 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6381 if (!next_stmt)
6382 break;
6384 /* Skip the gaps. Loads created for the gaps will be removed by the dead
6385 code elimination pass later. No need to check for the first stmt in
6386 the group, since it always exists.
6387 GROUP_GAP is the number of steps in elements from the previous
6388 access (if there is no gap GROUP_GAP is 1). We skip loads that
6389 correspond to the gaps. */
6390 if (next_stmt != first_stmt
6391 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
6393 gap_count++;
6394 continue;
6397 while (next_stmt)
6399 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6400 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6401 copies, and we put the new vector statement in the first available
6402 RELATED_STMT. */
6403 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6404 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6405 else
6407 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6409 gimple *prev_stmt =
6410 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6411 gimple *rel_stmt =
6412 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6413 while (rel_stmt)
6415 prev_stmt = rel_stmt;
6416 rel_stmt =
6417 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6420 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6421 new_stmt;
6425 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6426 gap_count = 1;
6427 /* If NEXT_STMT accesses the same DR as the previous statement,
6428 put the same TMP_DATA_REF as its vectorized statement; otherwise
6429 get the next data-ref from RESULT_CHAIN. */
6430 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6431 break;
6436 /* Function vect_can_force_dr_alignment_p.
6438 Returns whether the alignment of a DECL can be forced to be aligned
6439 on an ALIGNMENT-bit boundary. */
6441 bool
6442 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6444 if (!VAR_P (decl))
6445 return false;
6447 if (decl_in_symtab_p (decl)
6448 && !symtab_node::get (decl)->can_increase_alignment_p ())
6449 return false;
6451 if (TREE_STATIC (decl))
6452 return (alignment <= MAX_OFILE_ALIGNMENT);
6453 else
6454 return (alignment <= MAX_STACK_ALIGNMENT);
6458 /* Return whether the data reference DR is supported with respect to its
6459 alignment.
6460 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even if
6461 it is aligned, i.e., check if it is possible to vectorize it with different
6462 alignment. */
6464 enum dr_alignment_support
6465 vect_supportable_dr_alignment (struct data_reference *dr,
6466 bool check_aligned_accesses)
6468 gimple *stmt = DR_STMT (dr);
6469 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6470 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6471 machine_mode mode = TYPE_MODE (vectype);
6472 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6473 struct loop *vect_loop = NULL;
6474 bool nested_in_vect_loop = false;
6476 if (aligned_access_p (dr) && !check_aligned_accesses)
6477 return dr_aligned;
6479 /* For now assume all conditional loads/stores support unaligned
6480 access without any special code. */
6481 if (is_gimple_call (stmt)
6482 && gimple_call_internal_p (stmt)
6483 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6484 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6485 return dr_unaligned_supported;
6487 if (loop_vinfo)
6489 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6490 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6493 /* Possibly unaligned access. */
6495 /* We can choose between using the implicit realignment scheme (generating
6496 a misaligned_move stmt) and the explicit realignment scheme (generating
6497 aligned loads with a REALIGN_LOAD). There are two variants to the
6498 explicit realignment scheme: optimized, and unoptimized.
6499 We can optimize the realignment only if the step between consecutive
6500 vector loads is equal to the vector size. Since the vector memory
6501 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6502 is guaranteed that the misalignment amount remains the same throughout the
6503 execution of the vectorized loop. Therefore, we can create the
6504 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6505 at the loop preheader.
6507 However, in the case of outer-loop vectorization, when vectorizing a
6508 memory access in the inner-loop nested within the LOOP that is now being
6509 vectorized, while it is guaranteed that the misalignment of the
6510 vectorized memory access will remain the same in different outer-loop
6511 iterations, it is *not* guaranteed that it will remain the same throughout
6512 the execution of the inner-loop. This is because the inner-loop advances
6513 with the original scalar step (and not in steps of VS). If the inner-loop
6514 step happens to be a multiple of VS, then the misalignment remains fixed
6515 and we can use the optimized realignment scheme. For example:
6517 for (i=0; i<N; i++)
6518 for (j=0; j<M; j++)
6519 s += a[i+j];
6521 When vectorizing the i-loop in the above example, the step between
6522 consecutive vector loads is 1, and so the misalignment does not remain
6523 fixed across the execution of the inner-loop, and the realignment cannot
6524 be optimized (as illustrated in the following pseudo vectorized loop):
6526 for (i=0; i<N; i+=4)
6527 for (j=0; j<M; j++){
6528 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6529 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6530 // (assuming that we start from an aligned address).
6533 We therefore have to use the unoptimized realignment scheme:
6535 for (i=0; i<N; i+=4)
6536 for (j=k; j<M; j+=4)
6537 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6538 // that the misalignment of the initial address is
6539 // 0).
6541 The loop can then be vectorized as follows:
6543 for (k=0; k<4; k++){
6544 rt = get_realignment_token (&vp[k]);
6545 for (i=0; i<N; i+=4){
6546 v1 = vp[i+k];
6547 for (j=k; j<M; j+=4){
6548 v2 = vp[i+j+VS-1];
6549 va = REALIGN_LOAD <v1,v2,rt>;
6550 vs += va;
6551 v1 = v2;
6554 } */
6556 if (DR_IS_READ (dr))
6558 bool is_packed = false;
6559 tree type = (TREE_TYPE (DR_REF (dr)));
6561 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6562 && (!targetm.vectorize.builtin_mask_for_load
6563 || targetm.vectorize.builtin_mask_for_load ()))
6565 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6567 /* If we are doing SLP then the accesses need not have the
6568 same alignment, instead it depends on the SLP group size. */
6569 if (loop_vinfo
6570 && STMT_SLP_TYPE (stmt_info)
6571 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6572 * GROUP_SIZE (vinfo_for_stmt
6573 (GROUP_FIRST_ELEMENT (stmt_info))),
6574 TYPE_VECTOR_SUBPARTS (vectype)))
6576 else if (!loop_vinfo
6577 || (nested_in_vect_loop
6578 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6579 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6580 return dr_explicit_realign;
6581 else
6582 return dr_explicit_realign_optimized;
6584 if (!known_alignment_for_access_p (dr))
6585 is_packed = not_size_aligned (DR_REF (dr));
6587 if (targetm.vectorize.support_vector_misalignment
6588 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6589 /* Can't software pipeline the loads, but can at least do them. */
6590 return dr_unaligned_supported;
6592 else
6594 bool is_packed = false;
6595 tree type = (TREE_TYPE (DR_REF (dr)));
6597 if (!known_alignment_for_access_p (dr))
6598 is_packed = not_size_aligned (DR_REF (dr));
6600 if (targetm.vectorize.support_vector_misalignment
6601 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6602 return dr_unaligned_supported;
6605 /* Unsupported. */
6606 return dr_unaligned_unsupported;
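/* Illustrative standalone sketch (hypothetical helper, not GCC internals):
   the misalignment argument from the comment above, in plain C.  Assuming
   VS elements per vector and an array whose first element is aligned, the
   element misalignment of &a[i + j] is (i + j) % VS: it is invariant across
   the vectorized I iterations (I advances in multiples of VS) but cycles
   0,1,2,... as J advances by 1, which is why only the unoptimized
   explicit-realignment scheme applies to the inner loop.  */

static int
model_element_misalignment (int i, int j, int vs)
{
  return (i + j) % vs;
}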