gcc/tree-vect-data-refs.c (official-gcc.git, blob df362b0e1c5cd9ec4709c93ce9848101c0ee07f1)
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
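   /* As an illustration (the numbers are only an example): for VECTYPE
      V4SI and COUNT 2 the required array mode must cover 2 * 128 bits.
      If the target provides such a mode and the convert optab (e.g.
      "vec_load_lanes") has a handler for <array_mode, V4SI>, the
      function below reports the lanes optab as usable.  */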
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
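   /* For example, in a loop such as

        short *s;  int *d;
        for (i = 0; i < n; i++)
          d[i] = (int) s[i];

      the smallest scalar type is 'short' even though it only appears on
      the rhs of the promotion, and it is the type that has to drive the
      choice of the VF.  */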
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 if (is_gimple_assign (stmt)
134 && (gimple_assign_cast_p (stmt)
135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 *lhs_size_unit = lhs;
149 *rhs_size_unit = rhs;
150 return scalar_type;
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
158 static bool
159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164 return false;
166 if (!runtime_alias_check_p (ddr, loop,
167 optimize_loop_nest_for_speed_p (loop)))
168 return false;
170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171 return true;
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
176 static void
177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
179 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180 for (unsigned int i = 0; i < checks.length(); ++i)
181 if (checks[i] == value)
182 return;
184 if (dump_enabled_p ())
186 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188 dump_printf (MSG_NOTE, " is nonzero\n");
190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
193 /* Return true if we know that the order of vectorized STMT_A and
194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195 At least one of the statements is a write. */
197 static bool
198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
200 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206 return true;
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 group are emitted at the position of the first scalar load and all
210 stores in a group are emitted at the position of the last scalar store.
211 Thus writes will happen no earlier than their current position
212 (but could happen later) while reads will happen no later than their
213 current position (but could happen earlier). Reordering is therefore
214 only possible if the first access is a write. */
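  /* For example, if the earlier access is a scalar load that belongs to a
     load group and the later access is a store, the group load is emitted
     no later than its current position and the store no earlier than its
     current position, so the scalar load/store order is preserved.  If
     instead the write comes first, sinking it to the last store of its
     group could move it past the read, so we must be conservative.  */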
215 gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
216 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
219 /* A subroutine of vect_analyze_data_ref_dependence. Handle
220 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
221 distances. These distances are conservatively correct but they don't
222 reflect a guaranteed dependence.
224 Return true if this function does all the work necessary to avoid
225 an alias or false if the caller should use the dependence distances
226 to limit the vectorization factor in the usual way. LOOP_DEPTH is
227 the depth of the loop described by LOOP_VINFO and the other arguments
228 are as for vect_analyze_data_ref_dependence. */
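   /* For example, two array arguments accessed as a[i] and b[i] may get a
      recorded dependence distance that would only be real if 'a' and 'b'
      were in fact the same object; preferring a run-time alias check here
      avoids needlessly limiting the VF when they are independent.  */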
230 static bool
231 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
232 loop_vec_info loop_vinfo,
233 int loop_depth, unsigned int *max_vf)
235 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
236 lambda_vector dist_v;
237 unsigned int i;
238 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
240 int dist = dist_v[loop_depth];
241 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
243 /* If the user asserted safelen >= DIST consecutive iterations
244 can be executed concurrently, assume independence.
246 ??? An alternative would be to add the alias check even
247 in this case, and vectorize the fallback loop with the
248 maximum VF set to safelen. However, if the user has
249 explicitly given a length, it's less likely that that
250 would be a win. */
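	  /* E.g. with "#pragma omp simd safelen(8)" the user guarantees
	     that any 8 consecutive iterations may execute concurrently,
	     so a distance of, say, 3 can be ignored as long as the
	     vectorization factor is capped at 8.  */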
251 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
253 if ((unsigned int) loop->safelen < *max_vf)
254 *max_vf = loop->safelen;
255 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
256 continue;
259 /* For dependence distances of 2 or more, we have the option
260 of limiting VF or checking for an alias at runtime.
261 Prefer to check at runtime if we can, to avoid limiting
262 the VF unnecessarily when the bases are in fact independent.
264 Note that the alias checks will be removed if the VF ends up
265 being small enough. */
266 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
269 return true;
273 /* Function vect_analyze_data_ref_dependence.
275 Return TRUE if there (might) exist a dependence between a memory-reference
 276    DRA and a memory-reference DRB.  When versioning for alias can handle
 277    any dependence with a run-time check, return FALSE.  Adjust *MAX_VF
 278    according to the data dependence. */
280 static bool
281 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
282 loop_vec_info loop_vinfo,
283 unsigned int *max_vf)
285 unsigned int i;
286 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
287 struct data_reference *dra = DDR_A (ddr);
288 struct data_reference *drb = DDR_B (ddr);
289 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
290 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
291 lambda_vector dist_v;
292 unsigned int loop_depth;
294 /* In loop analysis all data references should be vectorizable. */
295 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
296 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
297 gcc_unreachable ();
299 /* Independent data accesses. */
300 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
301 return false;
303 if (dra == drb
304 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
305 return false;
307 /* We do not have to consider dependences between accesses that belong
308 to the same group. */
309 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
310 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
311 return false;
313 /* Even if we have an anti-dependence then, as the vectorized loop covers at
314 least two scalar iterations, there is always also a true dependence.
315 As the vectorizer does not re-order loads and stores we can ignore
316 the anti-dependence if TBAA can disambiguate both DRs similar to the
317 case with known negative distance anti-dependences (positive
318 distance anti-dependences would violate TBAA constraints). */
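  /* For instance, a read through an 'int *' and a write through a
     'float *' have non-conflicting alias sets, so under TBAA they cannot
     access the same location and the anti-dependence can be ignored.  */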
319 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
320 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
321 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
322 get_alias_set (DR_REF (drb))))
323 return false;
325 /* Unknown data dependence. */
326 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
328 /* If user asserted safelen consecutive iterations can be
329 executed concurrently, assume independence. */
330 if (loop->safelen >= 2)
332 if ((unsigned int) loop->safelen < *max_vf)
333 *max_vf = loop->safelen;
334 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
335 return false;
338 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
339 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
341 if (dump_enabled_p ())
343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
344 "versioning for alias not supported for: "
345 "can't determine dependence between ");
346 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
347 DR_REF (dra));
348 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
349 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
350 DR_REF (drb));
351 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
353 return true;
356 if (dump_enabled_p ())
358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
359 "versioning for alias required: "
360 "can't determine dependence between ");
361 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
362 DR_REF (dra));
363 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
364 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
365 DR_REF (drb));
366 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
369 /* Add to list of ddrs that need to be tested at run-time. */
370 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
373 /* Known data dependence. */
374 if (DDR_NUM_DIST_VECTS (ddr) == 0)
376 /* If user asserted safelen consecutive iterations can be
377 executed concurrently, assume independence. */
378 if (loop->safelen >= 2)
380 if ((unsigned int) loop->safelen < *max_vf)
381 *max_vf = loop->safelen;
382 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
383 return false;
386 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
387 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
389 if (dump_enabled_p ())
391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
392 "versioning for alias not supported for: "
393 "bad dist vector for ");
394 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
395 DR_REF (dra));
396 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
397 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
398 DR_REF (drb));
399 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
401 return true;
404 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "versioning for alias required: "
408 "bad dist vector for ");
409 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
410 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
411 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
412 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
414 /* Add to list of ddrs that need to be tested at run-time. */
415 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
418 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
420 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
421 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
422 loop_depth, max_vf))
423 return false;
425 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
427 int dist = dist_v[loop_depth];
429 if (dump_enabled_p ())
430 dump_printf_loc (MSG_NOTE, vect_location,
431 "dependence distance = %d.\n", dist);
433 if (dist == 0)
435 if (dump_enabled_p ())
437 dump_printf_loc (MSG_NOTE, vect_location,
438 "dependence distance == 0 between ");
439 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
440 dump_printf (MSG_NOTE, " and ");
441 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 442           dump_printf (MSG_NOTE, "\n");
445 /* When we perform grouped accesses and perform implicit CSE
446 by detecting equal accesses and doing disambiguation with
447 runtime alias tests like for
448 .. = a[i];
449 .. = a[i+1];
450 a[i] = ..;
451 a[i+1] = ..;
452 *p = ..;
453 .. = a[i];
454 .. = a[i+1];
455 where we will end up loading { a[i], a[i+1] } once, make
456 sure that inserting group loads before the first load and
457 stores after the last store will do the right thing.
458 Similar for groups like
459 a[i] = ...;
460 ... = a[i];
461 a[i+1] = ...;
462 where loads from the group interleave with the store. */
463 if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
465 if (dump_enabled_p ())
466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
467 "READ_WRITE dependence in interleaving.\n");
468 return true;
471 if (loop->safelen < 2)
473 tree indicator = dr_zero_step_indicator (dra);
474 if (TREE_CODE (indicator) != INTEGER_CST)
475 vect_check_nonzero_value (loop_vinfo, indicator);
476 else if (integer_zerop (indicator))
478 if (dump_enabled_p ())
479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
480 "access also has a zero step\n");
481 return true;
484 continue;
487 if (dist > 0 && DDR_REVERSED_P (ddr))
489 /* If DDR_REVERSED_P the order of the data-refs in DDR was
490 reversed (to make distance vector positive), and the actual
491 distance is negative. */
492 if (dump_enabled_p ())
493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
494 "dependence distance negative.\n");
495 /* Record a negative dependence distance to later limit the
496 amount of stmt copying / unrolling we can perform.
497 Only need to handle read-after-write dependence. */
498 if (DR_IS_READ (drb)
499 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
500 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
501 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
502 continue;
505 unsigned int abs_dist = abs (dist);
506 if (abs_dist >= 2 && abs_dist < *max_vf)
508 /* The dependence distance requires reduction of the maximal
509 vectorization factor. */
510 *max_vf = abs (dist);
511 if (dump_enabled_p ())
512 dump_printf_loc (MSG_NOTE, vect_location,
513 "adjusting maximal vectorization factor to %i\n",
514 *max_vf);
517 if (abs_dist >= *max_vf)
519 /* Dependence distance does not create dependence, as far as
520 vectorization is concerned, in this case. */
521 if (dump_enabled_p ())
522 dump_printf_loc (MSG_NOTE, vect_location,
523 "dependence distance >= VF.\n");
524 continue;
527 if (dump_enabled_p ())
529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
530 "not vectorized, possible dependence "
531 "between data-refs ");
 532       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 533       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 534       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 535       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
538 return true;
541 return false;
544 /* Function vect_analyze_data_ref_dependences.
546 Examine all the data references in the loop, and make sure there do not
547 exist any data dependences between them. Set *MAX_VF according to
548 the maximum vectorization factor the data dependences allow. */
550 bool
551 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
552 unsigned int *max_vf)
554 unsigned int i;
555 struct data_dependence_relation *ddr;
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE, vect_location,
559 "=== vect_analyze_data_ref_dependences ===\n");
561 LOOP_VINFO_DDRS (loop_vinfo)
562 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
563 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
564 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
565 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
566 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
567 &LOOP_VINFO_DDRS (loop_vinfo),
568 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
569 return false;
571 /* For epilogues we either have no aliases or alias versioning
 572      was applied to the original loop.  Therefore we may just get max_vf
 573      using the VF of the original loop. */
574 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
575 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
576 else
577 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
578 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
579 return false;
581 return true;
585 /* Function vect_slp_analyze_data_ref_dependence.
587 Return TRUE if there (might) exist a dependence between a memory-reference
 588    DRA and a memory-reference DRB in DDR, i.e. if the accesses might
 589    conflict in a way that would prevent SLP vectorization.  Return FALSE
 590    if the accesses are known to be independent. */
592 static bool
593 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
595 struct data_reference *dra = DDR_A (ddr);
596 struct data_reference *drb = DDR_B (ddr);
598 /* We need to check dependences of statements marked as unvectorizable
599 as well, they still can prohibit vectorization. */
601 /* Independent data accesses. */
602 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
603 return false;
605 if (dra == drb)
606 return false;
608 /* Read-read is OK. */
609 if (DR_IS_READ (dra) && DR_IS_READ (drb))
610 return false;
612 /* If dra and drb are part of the same interleaving chain consider
613 them independent. */
614 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
615 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
616 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
617 return false;
619 /* Unknown data dependence. */
620 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
622 if (dump_enabled_p ())
624 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
625 "can't determine dependence between ");
626 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
627 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
628 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
629 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
632 else if (dump_enabled_p ())
634 dump_printf_loc (MSG_NOTE, vect_location,
635 "determined dependence between ");
636 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
637 dump_printf (MSG_NOTE, " and ");
638 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
639 dump_printf (MSG_NOTE, "\n");
642 return true;
646 /* Analyze dependences involved in the transform of SLP NODE. STORES
647 contain the vector of scalar stores of this instance if we are
648 disambiguating the loads. */
650 static bool
651 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
652 vec<gimple *> stores, gimple *last_store)
654 /* This walks over all stmts involved in the SLP load/store done
655 in NODE verifying we can sink them up to the last stmt in the
656 group. */
657 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
658 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
660 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
661 if (access == last_access)
662 continue;
663 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
664 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
665 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
667 gimple *stmt = gsi_stmt (gsi);
668 if (! gimple_vuse (stmt)
669 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
670 continue;
672 /* If we couldn't record a (single) data reference for this
673 stmt we have to give up. */
674 /* ??? Here and below if dependence analysis fails we can resort
675 to the alias oracle which can handle more kinds of stmts. */
676 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
677 if (!dr_b)
678 return false;
680 bool dependent = false;
681 /* If we run into a store of this same instance (we've just
682 marked those) then delay dependence checking until we run
683 into the last store because this is where it will have
684 been sunk to (and we verify if we can do that as well). */
685 if (gimple_visited_p (stmt))
687 if (stmt != last_store)
688 continue;
689 unsigned i;
690 gimple *store;
691 FOR_EACH_VEC_ELT (stores, i, store)
693 data_reference *store_dr
694 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
695 ddr_p ddr = initialize_data_dependence_relation
696 (dr_a, store_dr, vNULL);
697 dependent = vect_slp_analyze_data_ref_dependence (ddr);
698 free_dependence_relation (ddr);
699 if (dependent)
700 break;
703 else
705 ddr_p ddr = initialize_data_dependence_relation (dr_a,
706 dr_b, vNULL);
707 dependent = vect_slp_analyze_data_ref_dependence (ddr);
708 free_dependence_relation (ddr);
710 if (dependent)
711 return false;
714 return true;
 718 /* Function vect_slp_analyze_instance_dependence.
 720    Examine all the data references in the SLP instance INSTANCE, and make
 721    sure there do not exist any data dependences between them that would
 722    prevent vectorizing the instance. */
724 bool
725 vect_slp_analyze_instance_dependence (slp_instance instance)
727 if (dump_enabled_p ())
728 dump_printf_loc (MSG_NOTE, vect_location,
729 "=== vect_slp_analyze_instance_dependence ===\n");
731 /* The stores of this instance are at the root of the SLP tree. */
732 slp_tree store = SLP_INSTANCE_TREE (instance);
733 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
734 store = NULL;
736 /* Verify we can sink stores to the vectorized stmt insert location. */
737 gimple *last_store = NULL;
738 if (store)
740 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
741 return false;
743 /* Mark stores in this instance and remember the last one. */
744 last_store = vect_find_last_scalar_stmt_in_slp (store);
745 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
746 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
749 bool res = true;
751 /* Verify we can sink loads to the vectorized stmt insert location,
752 special-casing stores of this instance. */
753 slp_tree load;
754 unsigned int i;
755 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
756 if (! vect_slp_analyze_node_dependences (instance, load,
757 store
758 ? SLP_TREE_SCALAR_STMTS (store)
759 : vNULL, last_store))
761 res = false;
762 break;
765 /* Unset the visited flag. */
766 if (store)
767 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
768 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
770 return res;
773 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
774 the statement that contains DRB, which is useful for recording in the
775 dump file. */
777 static void
778 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
779 innermost_loop_behavior *drb)
781 bool existed;
782 innermost_loop_behavior *&entry
783 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
784 if (!existed || entry->base_alignment < drb->base_alignment)
786 entry = drb;
787 if (dump_enabled_p ())
789 dump_printf_loc (MSG_NOTE, vect_location,
790 "recording new base alignment for ");
791 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
792 dump_printf (MSG_NOTE, "\n");
793 dump_printf_loc (MSG_NOTE, vect_location,
794 " alignment: %d\n", drb->base_alignment);
795 dump_printf_loc (MSG_NOTE, vect_location,
796 " misalignment: %d\n", drb->base_misalignment);
797 dump_printf_loc (MSG_NOTE, vect_location,
798 " based on: ");
799 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
804 /* If the region we're going to vectorize is reached, all unconditional
805 data references occur at least once. We can therefore pool the base
806 alignment guarantees from each unconditional reference. Do this by
807 going through all the data references in VINFO and checking whether
808 the containing statement makes the reference unconditionally. If so,
809 record the alignment of the base address in VINFO so that it can be
810 used for all other references with the same base. */
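/* For example, if the region contains an unconditional load from *p whose
   base is known to be 16-byte aligned, a second reference with the same
   base address that only executes conditionally can still reuse that
   alignment guarantee.  */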
812 void
813 vect_record_base_alignments (vec_info *vinfo)
815 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
816 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
817 data_reference *dr;
818 unsigned int i;
819 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
820 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
822 gimple *stmt = DR_STMT (dr);
823 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
825 /* If DR is nested in the loop that is being vectorized, we can also
826 record the alignment of the base wrt the outer loop. */
827 if (loop && nested_in_vect_loop_p (loop, stmt))
829 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
830 vect_record_base_alignment
831 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
836 /* Return the target alignment for the vectorized form of DR. */
838 static unsigned int
839 vect_calculate_target_alignment (struct data_reference *dr)
841 gimple *stmt = DR_STMT (dr);
842 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
843 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
844 return targetm.vectorize.preferred_vector_alignment (vectype);
847 /* Function vect_compute_data_ref_alignment
849 Compute the misalignment of the data reference DR.
851 Output:
852 1. If during the misalignment computation it is found that the data reference
853 cannot be vectorized then false is returned.
854 2. DR_MISALIGNMENT (DR) is defined.
856 FOR NOW: No analysis is actually performed. Misalignment is calculated
857 only for trivial cases. TODO. */
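/* As a small worked example (illustrative numbers): with a target vector
   alignment of 16 bytes, a base known to be 16-byte aligned and a constant
   init offset of 4 bytes, the access gets DR_MISALIGNMENT 4.  If the offset
   or step alignment cannot be shown to keep the misalignment constant, it
   is left as DR_MISALIGNMENT_UNKNOWN instead.  */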
859 bool
860 vect_compute_data_ref_alignment (struct data_reference *dr)
862 gimple *stmt = DR_STMT (dr);
863 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
864 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
865 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
866 struct loop *loop = NULL;
867 tree ref = DR_REF (dr);
868 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
870 if (dump_enabled_p ())
871 dump_printf_loc (MSG_NOTE, vect_location,
872 "vect_compute_data_ref_alignment:\n");
874 if (loop_vinfo)
875 loop = LOOP_VINFO_LOOP (loop_vinfo);
877 /* Initialize misalignment to unknown. */
878 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
880 innermost_loop_behavior *drb = vect_dr_behavior (dr);
881 bool step_preserves_misalignment_p;
883 unsigned HOST_WIDE_INT vector_alignment
884 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
885 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
887 /* No step for BB vectorization. */
888 if (!loop)
890 gcc_assert (integer_zerop (drb->step));
891 step_preserves_misalignment_p = true;
894 /* In case the dataref is in an inner-loop of the loop that is being
895 vectorized (LOOP), we use the base and misalignment information
896 relative to the outer-loop (LOOP). This is ok only if the misalignment
897 stays the same throughout the execution of the inner-loop, which is why
898 we have to check that the stride of the dataref in the inner-loop evenly
899 divides by the vector alignment. */
900 else if (nested_in_vect_loop_p (loop, stmt))
902 step_preserves_misalignment_p
903 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
905 if (dump_enabled_p ())
907 if (step_preserves_misalignment_p)
908 dump_printf_loc (MSG_NOTE, vect_location,
909 "inner step divides the vector alignment.\n");
910 else
911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
912 "inner step doesn't divide the vector"
913 " alignment.\n");
917 /* Similarly we can only use base and misalignment information relative to
918 an innermost loop if the misalignment stays the same throughout the
919 execution of the loop. As above, this is the case if the stride of
920 the dataref evenly divides by the alignment. */
921 else
923 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
924 step_preserves_misalignment_p
925 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
927 if (!step_preserves_misalignment_p && dump_enabled_p ())
928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
929 "step doesn't divide the vector alignment.\n");
932 unsigned int base_alignment = drb->base_alignment;
933 unsigned int base_misalignment = drb->base_misalignment;
935 /* Calculate the maximum of the pooled base address alignment and the
936 alignment that we can compute for DR itself. */
937 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
938 if (entry && base_alignment < (*entry)->base_alignment)
940 base_alignment = (*entry)->base_alignment;
941 base_misalignment = (*entry)->base_misalignment;
944 if (drb->offset_alignment < vector_alignment
945 || !step_preserves_misalignment_p
946 /* We need to know whether the step wrt the vectorized loop is
947 negative when computing the starting misalignment below. */
948 || TREE_CODE (drb->step) != INTEGER_CST)
950 if (dump_enabled_p ())
952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
953 "Unknown alignment for access: ");
954 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
955 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
957 return true;
960 if (base_alignment < vector_alignment)
962 unsigned int max_alignment;
963 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
964 if (max_alignment < vector_alignment
965 || !vect_can_force_dr_alignment_p (base,
966 vector_alignment * BITS_PER_UNIT))
968 if (dump_enabled_p ())
970 dump_printf_loc (MSG_NOTE, vect_location,
971 "can't force alignment of ref: ");
972 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
973 dump_printf (MSG_NOTE, "\n");
975 return true;
978 /* Force the alignment of the decl.
979 NOTE: This is the only change to the code we make during
980 the analysis phase, before deciding to vectorize the loop. */
981 if (dump_enabled_p ())
983 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
984 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
985 dump_printf (MSG_NOTE, "\n");
988 DR_VECT_AUX (dr)->base_decl = base;
989 DR_VECT_AUX (dr)->base_misaligned = true;
990 base_misalignment = 0;
992 poly_int64 misalignment
993 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
995 /* If this is a backward running DR then first access in the larger
996 vectype actually is N-1 elements before the address in the DR.
997 Adjust misalign accordingly. */
998 if (tree_int_cst_sgn (drb->step) < 0)
999 /* PLUS because STEP is negative. */
1000 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1001 * TREE_INT_CST_LOW (drb->step));
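  /* E.g. for a 4-element vector of 4-byte elements with step -4, the
     adjustment above adds (4 - 1) * -4 = -12 bytes, since the vector
     access starts three elements before the address recorded in the
     DR.  */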
1003 unsigned int const_misalignment;
1004 if (!known_misalignment (misalignment, vector_alignment,
1005 &const_misalignment))
1007 if (dump_enabled_p ())
1009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1010 "Non-constant misalignment for access: ");
1011 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1012 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1014 return true;
1017 SET_DR_MISALIGNMENT (dr, const_misalignment);
1019 if (dump_enabled_p ())
1021 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1022 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1023 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1024 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1027 return true;
1030 /* Function vect_update_misalignment_for_peel.
1031 Sets DR's misalignment
1032 - to 0 if it has the same alignment as DR_PEEL,
 1033    - to the misalignment computed using NPEEL if DR's misalignment is known,
1034 - to -1 (unknown) otherwise.
1036 DR - the data reference whose misalignment is to be adjusted.
1037 DR_PEEL - the data reference whose misalignment is being made
1038 zero in the vector loop by the peel.
1039 NPEEL - the number of iterations in the peel loop if the misalignment
1040 of DR_PEEL is known at compile time. */
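/* As an illustrative example: peeling NPEEL = 1 iteration of a forward
   access with element size 4, old misalignment 12 and a 16-byte target
   alignment gives (12 + 1 * 4) & (16 - 1) = 0, i.e. the access becomes
   aligned in the vector loop.  */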
1042 static void
1043 vect_update_misalignment_for_peel (struct data_reference *dr,
1044 struct data_reference *dr_peel, int npeel)
1046 unsigned int i;
1047 vec<dr_p> same_aligned_drs;
1048 struct data_reference *current_dr;
1049 int dr_size = vect_get_scalar_dr_size (dr);
1050 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1051 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1052 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1054 /* For interleaved data accesses the step in the loop must be multiplied by
1055 the size of the interleaving group. */
1056 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1057 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1058 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1059 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1061 /* It can be assumed that the data refs with the same alignment as dr_peel
1062 are aligned in the vector loop. */
1063 same_aligned_drs
1064 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1065 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1067 if (current_dr != dr)
1068 continue;
1069 gcc_assert (!known_alignment_for_access_p (dr)
1070 || !known_alignment_for_access_p (dr_peel)
1071 || (DR_MISALIGNMENT (dr) / dr_size
1072 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1073 SET_DR_MISALIGNMENT (dr, 0);
1074 return;
1077 if (known_alignment_for_access_p (dr)
1078 && known_alignment_for_access_p (dr_peel))
1080 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1081 int misal = DR_MISALIGNMENT (dr);
1082 misal += negative ? -npeel * dr_size : npeel * dr_size;
1083 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1084 SET_DR_MISALIGNMENT (dr, misal);
1085 return;
1088 if (dump_enabled_p ())
1089 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1090 "to unknown (-1).\n");
1091 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1095 /* Function verify_data_ref_alignment
1097 Return TRUE if DR can be handled with respect to alignment. */
1099 static bool
1100 verify_data_ref_alignment (data_reference_p dr)
1102 enum dr_alignment_support supportable_dr_alignment
1103 = vect_supportable_dr_alignment (dr, false);
1104 if (!supportable_dr_alignment)
1106 if (dump_enabled_p ())
1108 if (DR_IS_READ (dr))
1109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1110 "not vectorized: unsupported unaligned load.");
1111 else
1112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1113 "not vectorized: unsupported unaligned "
1114 "store.");
1116 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1117 DR_REF (dr));
1118 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1120 return false;
1123 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1124 dump_printf_loc (MSG_NOTE, vect_location,
1125 "Vectorizing an unaligned access.\n");
1127 return true;
1130 /* Function vect_verify_datarefs_alignment
1132 Return TRUE if all data references in the loop can be
1133 handled with respect to alignment. */
1135 bool
1136 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1138 vec<data_reference_p> datarefs = vinfo->datarefs;
1139 struct data_reference *dr;
1140 unsigned int i;
1142 FOR_EACH_VEC_ELT (datarefs, i, dr)
1144 gimple *stmt = DR_STMT (dr);
1145 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1147 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1148 continue;
1150 /* For interleaving, only the alignment of the first access matters. */
1151 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1152 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1153 continue;
1155 /* Strided accesses perform only component accesses, alignment is
1156 irrelevant for them. */
1157 if (STMT_VINFO_STRIDED_P (stmt_info)
1158 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1159 continue;
1161 if (! verify_data_ref_alignment (dr))
1162 return false;
1165 return true;
 1168 /* Given a memory reference EXP return whether its alignment is less
1169 than its size. */
1171 static bool
1172 not_size_aligned (tree exp)
1174 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1175 return true;
1177 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1178 > get_object_alignment (exp));
1181 /* Function vector_alignment_reachable_p
1183 Return true if vector alignment for DR is reachable by peeling
1184 a few loop iterations. Return false otherwise. */
1186 static bool
1187 vector_alignment_reachable_p (struct data_reference *dr)
1189 gimple *stmt = DR_STMT (dr);
1190 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1191 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1193 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 1195       /* For interleaved access we peel only if the number of iterations in
 1196          the prolog loop ({VF - misalignment}) is a multiple of the
 1197          number of interleaved accesses. */
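      /* E.g. with 8 elements per vector and a misalignment of 2 elements,
         the prolog needs 8 - 2 = 6 iterations; a group size of 3 divides
         that evenly, so peeling remains an option, whereas a group size
         of 4 would not.  */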
1198 int elem_size, mis_in_elements;
1200 /* FORNOW: handle only known alignment. */
1201 if (!known_alignment_for_access_p (dr))
1202 return false;
1204 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1205 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1206 elem_size = vector_element_size (vector_size, nelements);
1207 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1209 if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1210 return false;
1213 /* If misalignment is known at the compile time then allow peeling
1214 only if natural alignment is reachable through peeling. */
1215 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1217 HOST_WIDE_INT elmsize =
1218 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1219 if (dump_enabled_p ())
1221 dump_printf_loc (MSG_NOTE, vect_location,
1222 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1223 dump_printf (MSG_NOTE,
1224 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1226 if (DR_MISALIGNMENT (dr) % elmsize)
1228 if (dump_enabled_p ())
1229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1230 "data size does not divide the misalignment.\n");
1231 return false;
1235 if (!known_alignment_for_access_p (dr))
1237 tree type = TREE_TYPE (DR_REF (dr));
1238 bool is_packed = not_size_aligned (DR_REF (dr));
1239 if (dump_enabled_p ())
1240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1241 "Unknown misalignment, %snaturally aligned\n",
1242 is_packed ? "not " : "");
1243 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1246 return true;
1250 /* Calculate the cost of the memory access represented by DR. */
1252 static void
1253 vect_get_data_access_cost (struct data_reference *dr,
1254 unsigned int *inside_cost,
1255 unsigned int *outside_cost,
1256 stmt_vector_for_cost *body_cost_vec,
1257 stmt_vector_for_cost *prologue_cost_vec)
1259 gimple *stmt = DR_STMT (dr);
1260 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1261 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1262 int ncopies;
1264 if (PURE_SLP_STMT (stmt_info))
1265 ncopies = 1;
1266 else
1267 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1269 if (DR_IS_READ (dr))
1270 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1271 prologue_cost_vec, body_cost_vec, false);
1272 else
1273 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1275 if (dump_enabled_p ())
1276 dump_printf_loc (MSG_NOTE, vect_location,
1277 "vect_get_data_access_cost: inside_cost = %d, "
1278 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1282 typedef struct _vect_peel_info
1284 struct data_reference *dr;
1285 int npeel;
1286 unsigned int count;
1287 } *vect_peel_info;
1289 typedef struct _vect_peel_extended_info
1291 struct _vect_peel_info peel_info;
1292 unsigned int inside_cost;
1293 unsigned int outside_cost;
1294 } *vect_peel_extended_info;
1297 /* Peeling hashtable helpers. */
1299 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1301 static inline hashval_t hash (const _vect_peel_info *);
1302 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1305 inline hashval_t
1306 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1308 return (hashval_t) peel_info->npeel;
1311 inline bool
1312 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1314 return (a->npeel == b->npeel);
1318 /* Insert DR into peeling hash table with NPEEL as key. */
1320 static void
1321 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1322 loop_vec_info loop_vinfo, struct data_reference *dr,
1323 int npeel)
1325 struct _vect_peel_info elem, *slot;
1326 _vect_peel_info **new_slot;
1327 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1329 elem.npeel = npeel;
1330 slot = peeling_htab->find (&elem);
1331 if (slot)
1332 slot->count++;
1333 else
1335 slot = XNEW (struct _vect_peel_info);
1336 slot->npeel = npeel;
1337 slot->dr = dr;
1338 slot->count = 1;
1339 new_slot = peeling_htab->find_slot (slot, INSERT);
1340 *new_slot = slot;
1343 if (!supportable_dr_alignment
1344 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1345 slot->count += VECT_MAX_COST;
1349 /* Traverse peeling hash table to find peeling option that aligns maximum
1350 number of data accesses. */
1353 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1354 _vect_peel_extended_info *max)
1356 vect_peel_info elem = *slot;
1358 if (elem->count > max->peel_info.count
1359 || (elem->count == max->peel_info.count
1360 && max->peel_info.npeel > elem->npeel))
1362 max->peel_info.npeel = elem->npeel;
1363 max->peel_info.count = elem->count;
1364 max->peel_info.dr = elem->dr;
1367 return 1;
1370 /* Get the costs of peeling NPEEL iterations checking data access costs
1371 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1372 misalignment will be zero after peeling. */
1374 static void
1375 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1376 struct data_reference *dr0,
1377 unsigned int *inside_cost,
1378 unsigned int *outside_cost,
1379 stmt_vector_for_cost *body_cost_vec,
1380 stmt_vector_for_cost *prologue_cost_vec,
1381 unsigned int npeel,
1382 bool unknown_misalignment)
1384 unsigned i;
1385 data_reference *dr;
1387 FOR_EACH_VEC_ELT (datarefs, i, dr)
1389 gimple *stmt = DR_STMT (dr);
1390 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1391 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1392 continue;
1394 /* For interleaving, only the alignment of the first access
1395 matters. */
1396 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1397 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1398 continue;
1400 /* Strided accesses perform only component accesses, alignment is
1401 irrelevant for them. */
1402 if (STMT_VINFO_STRIDED_P (stmt_info)
1403 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1404 continue;
1406 int save_misalignment;
1407 save_misalignment = DR_MISALIGNMENT (dr);
1408 if (npeel == 0)
1410 else if (unknown_misalignment && dr == dr0)
1411 SET_DR_MISALIGNMENT (dr, 0);
1412 else
1413 vect_update_misalignment_for_peel (dr, dr0, npeel);
1414 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1415 body_cost_vec, prologue_cost_vec);
1416 SET_DR_MISALIGNMENT (dr, save_misalignment);
1420 /* Traverse peeling hash table and calculate cost for each peeling option.
1421 Find the one with the lowest cost. */
1424 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1425 _vect_peel_extended_info *min)
1427 vect_peel_info elem = *slot;
1428 int dummy;
1429 unsigned int inside_cost = 0, outside_cost = 0;
1430 gimple *stmt = DR_STMT (elem->dr);
1431 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1432 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1433 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1434 epilogue_cost_vec;
1436 prologue_cost_vec.create (2);
1437 body_cost_vec.create (2);
1438 epilogue_cost_vec.create (2);
1440 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1441 elem->dr, &inside_cost, &outside_cost,
1442 &body_cost_vec, &prologue_cost_vec,
1443 elem->npeel, false);
1445 body_cost_vec.release ();
1447 outside_cost += vect_get_known_peeling_cost
1448 (loop_vinfo, elem->npeel, &dummy,
1449 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1450 &prologue_cost_vec, &epilogue_cost_vec);
1452 /* Prologue and epilogue costs are added to the target model later.
1453 These costs depend only on the scalar iteration cost, the
1454 number of peeling iterations finally chosen, and the number of
1455 misaligned statements. So discard the information found here. */
1456 prologue_cost_vec.release ();
1457 epilogue_cost_vec.release ();
1459 if (inside_cost < min->inside_cost
1460 || (inside_cost == min->inside_cost
1461 && outside_cost < min->outside_cost))
1463 min->inside_cost = inside_cost;
1464 min->outside_cost = outside_cost;
1465 min->peel_info.dr = elem->dr;
1466 min->peel_info.npeel = elem->npeel;
1467 min->peel_info.count = elem->count;
1470 return 1;
1474 /* Choose best peeling option by traversing peeling hash table and either
1475 choosing an option with the lowest cost (if cost model is enabled) or the
1476 option that aligns as many accesses as possible. */
1478 static struct _vect_peel_extended_info
1479 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1480 loop_vec_info loop_vinfo)
1482 struct _vect_peel_extended_info res;
1484 res.peel_info.dr = NULL;
1486 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1488 res.inside_cost = INT_MAX;
1489 res.outside_cost = INT_MAX;
1490 peeling_htab->traverse <_vect_peel_extended_info *,
1491 vect_peeling_hash_get_lowest_cost> (&res);
1493 else
1495 res.peel_info.count = 0;
1496 peeling_htab->traverse <_vect_peel_extended_info *,
1497 vect_peeling_hash_get_most_frequent> (&res);
1498 res.inside_cost = 0;
1499 res.outside_cost = 0;
1502 return res;
1505 /* Return true if the new peeling NPEEL is supported. */
1507 static bool
1508 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1509 unsigned npeel)
1511 unsigned i;
1512 struct data_reference *dr = NULL;
1513 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1514 gimple *stmt;
1515 stmt_vec_info stmt_info;
1516 enum dr_alignment_support supportable_dr_alignment;
1518 /* Ensure that all data refs can be vectorized after the peel. */
1519 FOR_EACH_VEC_ELT (datarefs, i, dr)
1521 int save_misalignment;
1523 if (dr == dr0)
1524 continue;
1526 stmt = DR_STMT (dr);
1527 stmt_info = vinfo_for_stmt (stmt);
1528 /* For interleaving, only the alignment of the first access
1529 matters. */
1530 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1531 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1532 continue;
1534 /* Strided accesses perform only component accesses, alignment is
1535 irrelevant for them. */
1536 if (STMT_VINFO_STRIDED_P (stmt_info)
1537 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1538 continue;
1540 save_misalignment = DR_MISALIGNMENT (dr);
1541 vect_update_misalignment_for_peel (dr, dr0, npeel);
1542 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1543 SET_DR_MISALIGNMENT (dr, save_misalignment);
1545 if (!supportable_dr_alignment)
1546 return false;
1549 return true;
1552 /* Function vect_enhance_data_refs_alignment
1554 This pass will use loop versioning and loop peeling in order to enhance
1555 the alignment of data references in the loop.
1557 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1558 original loop is to be vectorized. Any other loops that are created by
1559 the transformations performed in this pass - are not supposed to be
1560 vectorized. This restriction will be relaxed.
1562 This pass will require a cost model to guide it whether to apply peeling
1563 or versioning or a combination of the two. For example, the scheme that
 1564    Intel uses when given a loop with several memory accesses, is as follows:
 1565    choose one memory access ('p') whose alignment you want to force by doing
1566 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1567 other accesses are not necessarily aligned, or (2) use loop versioning to
1568 generate one loop in which all accesses are aligned, and another loop in
1569 which only 'p' is necessarily aligned.
1571 ("Automatic Intra-Register Vectorization for the Intel Architecture",
 1572    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1573 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1575 Devising a cost model is the most critical aspect of this work. It will
1576 guide us on which access to peel for, whether to use loop versioning, how
1577 many versions to create, etc. The cost model will probably consist of
1578 generic considerations as well as target specific considerations (on
1579 powerpc for example, misaligned stores are more painful than misaligned
1580 loads).
1582 Here are the general steps involved in alignment enhancements:
1584 -- original loop, before alignment analysis:
1585 for (i=0; i<N; i++){
1586 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1587 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1590 -- After vect_compute_data_refs_alignment:
1591 for (i=0; i<N; i++){
1592 x = q[i]; # DR_MISALIGNMENT(q) = 3
1593 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1596 -- Possibility 1: we do loop versioning:
1597 if (p is aligned) {
1598 for (i=0; i<N; i++){ # loop 1A
1599 x = q[i]; # DR_MISALIGNMENT(q) = 3
1600 p[i] = y; # DR_MISALIGNMENT(p) = 0
1603 else {
1604 for (i=0; i<N; i++){ # loop 1B
1605 x = q[i]; # DR_MISALIGNMENT(q) = 3
1606 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1610 -- Possibility 2: we do loop peeling:
1611 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1612 x = q[i];
1613 p[i] = y;
1615 for (i = 3; i < N; i++){ # loop 2A
1616 x = q[i]; # DR_MISALIGNMENT(q) = 0
1617 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1620 -- Possibility 3: combination of loop peeling and versioning:
1621 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1622 x = q[i];
1623 p[i] = y;
1625 if (p is aligned) {
1626 for (i = 3; i<N; i++){ # loop 3A
1627 x = q[i]; # DR_MISALIGNMENT(q) = 0
1628 p[i] = y; # DR_MISALIGNMENT(p) = 0
1631 else {
1632 for (i = 3; i<N; i++){ # loop 3B
1633 x = q[i]; # DR_MISALIGNMENT(q) = 0
1634 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1638 These loops are later passed to loop_transform to be vectorized. The
1639 vectorizer will use the alignment information to guide the transformation
1640 (whether to generate regular loads/stores, or with special handling for
1641 misalignment). */
1643 bool
1644 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1646 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1647 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1648 enum dr_alignment_support supportable_dr_alignment;
1649 struct data_reference *dr0 = NULL, *first_store = NULL;
1650 struct data_reference *dr;
1651 unsigned int i, j;
1652 bool do_peeling = false;
1653 bool do_versioning = false;
1654 bool stat;
1655 gimple *stmt;
1656 stmt_vec_info stmt_info;
1657 unsigned int npeel = 0;
1658 bool one_misalignment_known = false;
1659 bool one_misalignment_unknown = false;
1660 bool one_dr_unsupportable = false;
1661 struct data_reference *unsupportable_dr = NULL;
1662 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1663 unsigned possible_npeel_number = 1;
1664 tree vectype;
1665 unsigned int mis, same_align_drs_max = 0;
1666 hash_table<peel_info_hasher> peeling_htab (1);
1668 if (dump_enabled_p ())
1669 dump_printf_loc (MSG_NOTE, vect_location,
1670 "=== vect_enhance_data_refs_alignment ===\n");
1672 /* Reset data so we can safely be called multiple times. */
1673 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1674 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1676 /* While cost model enhancements are expected in the future, the high level
1677 view of the code at this time is as follows:
1679 A) If there is a misaligned access then see if peeling to align
1680 this access can make all data references satisfy
1681 vect_supportable_dr_alignment. If so, update data structures
1682 as needed and return true.
1684 B) If peeling wasn't possible and there is a data reference with an
1685 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1686 then see if loop versioning checks can be used to make all data
1687 references satisfy vect_supportable_dr_alignment. If so, update
1688 data structures as needed and return true.
1690 C) If neither peeling nor versioning were successful then return false if
1691 any data reference does not satisfy vect_supportable_dr_alignment.
1693 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1695 Note, Possibility 3 above (which is peeling and versioning together) is not
1696 being done at this time. */
1698 /* (1) Peeling to force alignment. */
1700 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1701 Considerations:
1702 + How many accesses will become aligned due to the peeling
1703 - How many accesses will become unaligned due to the peeling,
1704 and the cost of misaligned accesses.
1705 - The cost of peeling (the extra runtime checks, the increase
1706 in code size). */
1708 FOR_EACH_VEC_ELT (datarefs, i, dr)
1710 stmt = DR_STMT (dr);
1711 stmt_info = vinfo_for_stmt (stmt);
1713 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1714 continue;
1716 /* For interleaving, only the alignment of the first access
1717 matters. */
1718 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1719 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1720 continue;
1722 /* For invariant accesses there is nothing to enhance. */
1723 if (integer_zerop (DR_STEP (dr)))
1724 continue;
1726 /* Strided accesses perform only component accesses, alignment is
1727 irrelevant for them. */
1728 if (STMT_VINFO_STRIDED_P (stmt_info)
1729 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1730 continue;
1732 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1733 do_peeling = vector_alignment_reachable_p (dr);
1734 if (do_peeling)
1736 if (known_alignment_for_access_p (dr))
1738 unsigned int npeel_tmp = 0;
1739 bool negative = tree_int_cst_compare (DR_STEP (dr),
1740 size_zero_node) < 0;
1742 vectype = STMT_VINFO_VECTYPE (stmt_info);
1743 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1744 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1745 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1746 if (DR_MISALIGNMENT (dr) != 0)
1747 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1749 /* For multiple types, it is possible that the bigger type access
1750 will have more than one peeling option. E.g., a loop with two
1751 types: one of size (vector size / 4), and the other one of
1752 size (vector size / 8). The vectorization factor will be 8. If both
1753 accesses are misaligned by 3, the first one needs one scalar
1754 iteration to be aligned, and the second one needs 5. But the
1755 first one will be aligned also by peeling 5 scalar
1756 iterations, and in that case both accesses will be aligned.
1757 Hence, in addition to the immediate peeling amount, we also want
1758 to try adding a full vector's worth of iterations, as long as we
1759 don't exceed the vectorization factor.
1760 We do this automatically for the cost model, since we calculate
1761 the cost of every peeling option. */
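      /* For example, with a 16-byte target alignment, 4-byte elements and a
         known misalignment of 12 bytes on a forward (positive-step) access,
         NPEEL_TMP starts at ((-12) & 15) / 4 = 1; with the unlimited cost
         model the loop below also records 1 + 4 = 5, 1 + 8 = 9, ..., up to
         POSSIBLE_NPEEL_NUMBER candidate peeling amounts in total.  */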
1762 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1764 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1765 ? vf * GROUP_SIZE (stmt_info) : vf);
1766 possible_npeel_number
1767 = vect_get_num_vectors (nscalars, vectype);
1769 /* NPEEL_TMP is 0 when there is no misalignment, but also
1770 allow peeling NELEMENTS. */
1771 if (DR_MISALIGNMENT (dr) == 0)
1772 possible_npeel_number++;
1775 /* Save info about DR in the hash table. Also include peeling
1776 amounts according to the explanation above. */
1777 for (j = 0; j < possible_npeel_number; j++)
1779 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1780 dr, npeel_tmp);
1781 npeel_tmp += target_align / dr_size;
1784 one_misalignment_known = true;
1786 else
1788 /* If we don't know any misalignment values, we prefer
1789 peeling for the data-ref that has the maximum number of data-refs
1790 with the same alignment, unless the target prefers to align
1791 stores over loads. */
1792 unsigned same_align_drs
1793 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1794 if (!dr0
1795 || same_align_drs_max < same_align_drs)
1797 same_align_drs_max = same_align_drs;
1798 dr0 = dr;
1800 /* For data-refs with the same number of related
1801 accesses prefer the one where the misalign
1802 computation will be invariant in the outermost loop. */
1803 else if (same_align_drs_max == same_align_drs)
1805 struct loop *ivloop0, *ivloop;
1806 ivloop0 = outermost_invariant_loop_for_expr
1807 (loop, DR_BASE_ADDRESS (dr0));
1808 ivloop = outermost_invariant_loop_for_expr
1809 (loop, DR_BASE_ADDRESS (dr));
1810 if ((ivloop && !ivloop0)
1811 || (ivloop && ivloop0
1812 && flow_loop_nested_p (ivloop, ivloop0)))
1813 dr0 = dr;
1816 one_misalignment_unknown = true;
1818 /* Check for data refs with unsupportable alignment that
1819 can be peeled. */
1820 if (!supportable_dr_alignment)
1822 one_dr_unsupportable = true;
1823 unsupportable_dr = dr;
1826 if (!first_store && DR_IS_WRITE (dr))
1827 first_store = dr;
1830 else
1832 if (!aligned_access_p (dr))
1834 if (dump_enabled_p ())
1835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1836 "vector alignment may not be reachable\n");
1837 break;
1842 /* Check if we can possibly peel the loop. */
1843 if (!vect_can_advance_ivs_p (loop_vinfo)
1844 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1845 || loop->inner)
1846 do_peeling = false;
1848 struct _vect_peel_extended_info peel_for_known_alignment;
1849 struct _vect_peel_extended_info peel_for_unknown_alignment;
1850 struct _vect_peel_extended_info best_peel;
1852 peel_for_unknown_alignment.inside_cost = INT_MAX;
1853 peel_for_unknown_alignment.outside_cost = INT_MAX;
1854 peel_for_unknown_alignment.peel_info.count = 0;
1856 if (do_peeling
1857 && one_misalignment_unknown)
1859 /* Check whether the target prefers aligning stores over loads, i.e., whether
1860 misaligned stores are more expensive than misaligned loads (taking
1861 drs with the same alignment into account). */
1862 unsigned int load_inside_cost = 0;
1863 unsigned int load_outside_cost = 0;
1864 unsigned int store_inside_cost = 0;
1865 unsigned int store_outside_cost = 0;
1866 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1868 stmt_vector_for_cost dummy;
1869 dummy.create (2);
1870 vect_get_peeling_costs_all_drs (datarefs, dr0,
1871 &load_inside_cost,
1872 &load_outside_cost,
1873 &dummy, &dummy, estimated_npeels, true);
1874 dummy.release ();
1876 if (first_store)
1878 dummy.create (2);
1879 vect_get_peeling_costs_all_drs (datarefs, first_store,
1880 &store_inside_cost,
1881 &store_outside_cost,
1882 &dummy, &dummy,
1883 estimated_npeels, true);
1884 dummy.release ();
1886 else
1888 store_inside_cost = INT_MAX;
1889 store_outside_cost = INT_MAX;
1892 if (load_inside_cost > store_inside_cost
1893 || (load_inside_cost == store_inside_cost
1894 && load_outside_cost > store_outside_cost))
1896 dr0 = first_store;
1897 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1898 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1900 else
1902 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1903 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1906 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1907 prologue_cost_vec.create (2);
1908 epilogue_cost_vec.create (2);
1910 int dummy2;
1911 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1912 (loop_vinfo, estimated_npeels, &dummy2,
1913 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1914 &prologue_cost_vec, &epilogue_cost_vec);
1916 prologue_cost_vec.release ();
1917 epilogue_cost_vec.release ();
1919 peel_for_unknown_alignment.peel_info.count = 1
1920 + STMT_VINFO_SAME_ALIGN_REFS
1921 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1924 peel_for_unknown_alignment.peel_info.npeel = 0;
1925 peel_for_unknown_alignment.peel_info.dr = dr0;
1927 best_peel = peel_for_unknown_alignment;
1929 peel_for_known_alignment.inside_cost = INT_MAX;
1930 peel_for_known_alignment.outside_cost = INT_MAX;
1931 peel_for_known_alignment.peel_info.count = 0;
1932 peel_for_known_alignment.peel_info.dr = NULL;
1934 if (do_peeling && one_misalignment_known)
1936 /* Peeling is possible, but no data access requires alignment in order
1937 to be supported. So we try to choose the best possible peeling from
1938 the hash table. */
1939 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1940 (&peeling_htab, loop_vinfo);
1943 /* Compare costs of peeling for known and unknown alignment. */
1944 if (peel_for_known_alignment.peel_info.dr != NULL
1945 && peel_for_unknown_alignment.inside_cost
1946 >= peel_for_known_alignment.inside_cost)
1948 best_peel = peel_for_known_alignment;
1950 /* If the best peeling for known alignment has NPEEL == 0, perform no
1951 peeling at all except if there is an unsupportable dr that we can
1952 align. */
1953 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1954 do_peeling = false;
1957 /* If there is an unsupportable data ref, prefer this over all choices so far
1958 since we'd have to discard a chosen peeling except when it accidentally
1959 aligned the unsupportable data ref. */
1960 if (one_dr_unsupportable)
1961 dr0 = unsupportable_dr;
1962 else if (do_peeling)
1964 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1965 TODO: Use nopeel_outside_cost or get rid of it? */
1966 unsigned nopeel_inside_cost = 0;
1967 unsigned nopeel_outside_cost = 0;
1969 stmt_vector_for_cost dummy;
1970 dummy.create (2);
1971 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1972 &nopeel_outside_cost, &dummy, &dummy,
1973 0, false);
1974 dummy.release ();
1976 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1977 costs will be recorded. */
1978 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1979 prologue_cost_vec.create (2);
1980 epilogue_cost_vec.create (2);
1982 int dummy2;
1983 nopeel_outside_cost += vect_get_known_peeling_cost
1984 (loop_vinfo, 0, &dummy2,
1985 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1986 &prologue_cost_vec, &epilogue_cost_vec);
1988 prologue_cost_vec.release ();
1989 epilogue_cost_vec.release ();
1991 npeel = best_peel.peel_info.npeel;
1992 dr0 = best_peel.peel_info.dr;
1994 /* If not peeling at all is no more expensive than the best peeling we
1995 have found so far, don't perform any peeling. */
1996 if (nopeel_inside_cost <= best_peel.inside_cost)
1997 do_peeling = false;
2000 if (do_peeling)
2002 stmt = DR_STMT (dr0);
2003 stmt_info = vinfo_for_stmt (stmt);
2004 vectype = STMT_VINFO_VECTYPE (stmt_info);
2006 if (known_alignment_for_access_p (dr0))
2008 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2009 size_zero_node) < 0;
2010 if (!npeel)
2012 /* Since the misalignment is known at compile time, compute the number of
2013 iterations in the peeled loop (the peeling factor) for use in
2014 updating DR_MISALIGNMENT values. The peeling factor is the number
2015 of elements needed to reach the target alignment boundary, i.e. the
2016 target alignment in elements minus the misalignment in elements. */
2017 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2018 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2019 npeel = ((mis & (target_align - 1))
2020 / vect_get_scalar_dr_size (dr0));
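          /* E.g. for a forward access with a 16-byte target alignment,
             4-byte elements and a misalignment of 4 bytes this gives
             ((-4) & 15) / 4 = 3 peeled scalar iterations, enough to move
             the first access to the next 16-byte boundary.  */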
2023 /* For interleaved data access every iteration accesses all the
2024 members of the group, therefore we divide the number of iterations
2025 by the group size. */
2026 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2027 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2028 npeel /= GROUP_SIZE (stmt_info);
2030 if (dump_enabled_p ())
2031 dump_printf_loc (MSG_NOTE, vect_location,
2032 "Try peeling by %d\n", npeel);
2035 /* Ensure that all datarefs can be vectorized after the peel. */
2036 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2037 do_peeling = false;
2039 /* Check if all datarefs are supportable and log. */
2040 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2042 stat = vect_verify_datarefs_alignment (loop_vinfo);
2043 if (!stat)
2044 do_peeling = false;
2045 else
2046 return stat;
2049 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2050 if (do_peeling)
2052 unsigned max_allowed_peel
2053 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2054 if (max_allowed_peel != (unsigned)-1)
2056 unsigned max_peel = npeel;
2057 if (max_peel == 0)
2059 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2060 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
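          /* When NPEEL is zero the peeling amount is only known at run time,
             so assume the worst case: e.g. a 16-byte target alignment and
             4-byte elements give at most 16 / 4 - 1 = 3 peeled iterations.  */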
2062 if (max_peel > max_allowed_peel)
2064 do_peeling = false;
2065 if (dump_enabled_p ())
2066 dump_printf_loc (MSG_NOTE, vect_location,
2067 "Disable peeling, max peels reached: %d\n", max_peel);
2072 /* Cost model #2 - if peeling may result in a remaining loop not
2073 iterating enough to be vectorized then do not peel. Since this
2074 is a cost heuristic rather than a correctness decision, use the
2075 most likely runtime value for variable vectorization factors. */
2076 if (do_peeling
2077 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2079 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2080 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2081 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2082 < assumed_vf + max_peel)
2083 do_peeling = false;
2086 if (do_peeling)
2088 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2089 If the misalignment of DR_i is identical to that of dr0 then set
2090 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2091 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2092 by the peeling factor times the element size of DR_i (MOD the
2093 vectorization factor times the size). Otherwise, the
2094 misalignment of DR_i must be set to unknown. */
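      /* Per the rule above: e.g. a DR_i with 4-byte elements and a known
         misalignment of 4 bytes, after peeling 3 iterations with a
         vectorization factor of 4, gets misalignment
         (4 + 3 * 4) mod (4 * 4) = 0.  */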
2095 FOR_EACH_VEC_ELT (datarefs, i, dr)
2096 if (dr != dr0)
2098 /* Strided accesses perform only component accesses, alignment
2099 is irrelevant for them. */
2100 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2101 if (STMT_VINFO_STRIDED_P (stmt_info)
2102 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2103 continue;
2105 vect_update_misalignment_for_peel (dr, dr0, npeel);
2108 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2109 if (npeel)
2110 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2111 else
2112 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2113 = DR_MISALIGNMENT (dr0);
2114 SET_DR_MISALIGNMENT (dr0, 0);
2115 if (dump_enabled_p ())
2117 dump_printf_loc (MSG_NOTE, vect_location,
2118 "Alignment of access forced using peeling.\n");
2119 dump_printf_loc (MSG_NOTE, vect_location,
2120 "Peeling for alignment will be applied.\n");
2123 /* The inside-loop cost will be accounted for in vectorizable_load
2124 and vectorizable_store correctly with adjusted alignments.
2125 Drop the body_cst_vec on the floor here. */
2126 stat = vect_verify_datarefs_alignment (loop_vinfo);
2127 gcc_assert (stat);
2128 return stat;
2132 /* (2) Versioning to force alignment. */
2134 /* Try versioning if:
2135 1) optimize loop for speed
2136 2) there is at least one unsupported misaligned data ref with an unknown
2137 misalignment, and
2138 3) all misaligned data refs with a known misalignment are supported, and
2139 4) the number of runtime alignment checks is within reason. */
2141 do_versioning =
2142 optimize_loop_nest_for_speed_p (loop)
2143 && (!loop->inner); /* FORNOW */
2145 if (do_versioning)
2147 FOR_EACH_VEC_ELT (datarefs, i, dr)
2149 stmt = DR_STMT (dr);
2150 stmt_info = vinfo_for_stmt (stmt);
2152 /* For interleaving, only the alignment of the first access
2153 matters. */
2154 if (aligned_access_p (dr)
2155 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2156 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2157 continue;
2159 if (STMT_VINFO_STRIDED_P (stmt_info))
2161 /* Strided loads perform only component accesses, alignment is
2162 irrelevant for them. */
2163 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2164 continue;
2165 do_versioning = false;
2166 break;
2169 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2171 if (!supportable_dr_alignment)
2173 gimple *stmt;
2174 int mask;
2175 tree vectype;
2177 if (known_alignment_for_access_p (dr)
2178 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2179 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2181 do_versioning = false;
2182 break;
2185 stmt = DR_STMT (dr);
2186 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2187 gcc_assert (vectype);
2189 /* At present we don't support versioning for alignment
2190 with variable VF, since there's no guarantee that the
2191 VF is a power of two. We could relax this if we added
2192 a way of enforcing a power-of-two size. */
2193 unsigned HOST_WIDE_INT size;
2194 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2196 do_versioning = false;
2197 break;
2200 /* The rightmost bits of an aligned address must be zeros.
2201 Construct the mask needed for this test. For example,
2202 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2203 mask must be 15 = 0xf. */
2204 mask = size - 1;
2206 /* FORNOW: use the same mask to test all potentially unaligned
2207 references in the loop. The vectorizer currently supports
2208 a single vector size, see the reference to
2209 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2210 vectorization factor is computed. */
2211 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2212 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2213 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2214 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2215 DR_STMT (dr));
2219 /* Versioning requires at least one misaligned data reference. */
2220 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2221 do_versioning = false;
2222 else if (!do_versioning)
2223 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2226 if (do_versioning)
2228 vec<gimple *> may_misalign_stmts
2229 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2230 gimple *stmt;
2232 /* It can now be assumed that the data references in the statements
2233 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2234 of the loop being vectorized. */
2235 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2237 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2238 dr = STMT_VINFO_DATA_REF (stmt_info);
2239 SET_DR_MISALIGNMENT (dr, 0);
2240 if (dump_enabled_p ())
2241 dump_printf_loc (MSG_NOTE, vect_location,
2242 "Alignment of access forced using versioning.\n");
2245 if (dump_enabled_p ())
2246 dump_printf_loc (MSG_NOTE, vect_location,
2247 "Versioning for alignment will be applied.\n");
2249 /* Peeling and versioning can't be done together at this time. */
2250 gcc_assert (! (do_peeling && do_versioning));
2252 stat = vect_verify_datarefs_alignment (loop_vinfo);
2253 gcc_assert (stat);
2254 return stat;
2257 /* This point is reached if neither peeling nor versioning is being done. */
2258 gcc_assert (! (do_peeling || do_versioning));
2260 stat = vect_verify_datarefs_alignment (loop_vinfo);
2261 return stat;
2265 /* Function vect_find_same_alignment_drs.
2267 Update group and alignment relations according to the chosen
2268 vectorization factor. */
2270 static void
2271 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2273 struct data_reference *dra = DDR_A (ddr);
2274 struct data_reference *drb = DDR_B (ddr);
2275 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2276 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2278 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2279 return;
2281 if (dra == drb)
2282 return;
2284 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2285 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2286 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2287 return;
2289 /* Two references with distance zero have the same alignment. */
2290 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2291 - wi::to_poly_offset (DR_INIT (drb)));
2292 if (maybe_ne (diff, 0))
2294 /* Get the wider of the two alignments. */
2295 unsigned int align_a = (vect_calculate_target_alignment (dra)
2296 / BITS_PER_UNIT);
2297 unsigned int align_b = (vect_calculate_target_alignment (drb)
2298 / BITS_PER_UNIT);
2299 unsigned int max_align = MAX (align_a, align_b);
2301 /* Require the gap to be a multiple of the larger vector alignment. */
2302 if (!multiple_p (diff, max_align))
2303 return;
2306 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2307 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2308 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_NOTE, vect_location,
2311 "accesses have the same alignment: ");
2312 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2313 dump_printf (MSG_NOTE, " and ");
2314 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2315 dump_printf (MSG_NOTE, "\n");
2320 /* Function vect_analyze_data_refs_alignment
2322 Analyze the alignment of the data-references in the loop.
2323 Return FALSE if a data reference is found that cannot be vectorized. */
2325 bool
2326 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2328 if (dump_enabled_p ())
2329 dump_printf_loc (MSG_NOTE, vect_location,
2330 "=== vect_analyze_data_refs_alignment ===\n");
2332 /* Mark groups of data references with same alignment using
2333 data dependence information. */
2334 vec<ddr_p> ddrs = vinfo->ddrs;
2335 struct data_dependence_relation *ddr;
2336 unsigned int i;
2338 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2339 vect_find_same_alignment_drs (ddr);
2341 vec<data_reference_p> datarefs = vinfo->datarefs;
2342 struct data_reference *dr;
2344 vect_record_base_alignments (vinfo);
2345 FOR_EACH_VEC_ELT (datarefs, i, dr)
2347 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2348 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2349 && !vect_compute_data_ref_alignment (dr))
2351 /* Strided accesses perform only component accesses, misalignment
2352 information is irrelevant for them. */
2353 if (STMT_VINFO_STRIDED_P (stmt_info)
2354 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2355 continue;
2357 if (dump_enabled_p ())
2358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2359 "not vectorized: can't calculate alignment "
2360 "for data ref.\n");
2362 return false;
2366 return true;
2370 /* Analyze alignment of DRs of stmts in NODE. */
2372 static bool
2373 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2375 /* We vectorize from the first scalar stmt in the node unless
2376 the node is permuted in which case we start from the first
2377 element in the group. */
2378 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2379 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2380 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2381 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2383 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2384 if (! vect_compute_data_ref_alignment (dr)
2385 /* For creating the data-ref pointer we need alignment of the
2386 first element anyway. */
2387 || (dr != first_dr
2388 && ! vect_compute_data_ref_alignment (first_dr))
2389 || ! verify_data_ref_alignment (dr))
2391 if (dump_enabled_p ())
2392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2393 "not vectorized: bad data alignment in basic "
2394 "block.\n");
2395 return false;
2398 return true;
2401 /* Function vect_slp_analyze_instance_alignment
2403 Analyze the alignment of the data-references in the SLP instance.
2404 Return FALSE if a data reference is found that cannot be vectorized. */
2406 bool
2407 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2409 if (dump_enabled_p ())
2410 dump_printf_loc (MSG_NOTE, vect_location,
2411 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2413 slp_tree node;
2414 unsigned i;
2415 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2416 if (! vect_slp_analyze_and_verify_node_alignment (node))
2417 return false;
2419 node = SLP_INSTANCE_TREE (instance);
2420 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2421 && ! vect_slp_analyze_and_verify_node_alignment
2422 (SLP_INSTANCE_TREE (instance)))
2423 return false;
2425 return true;
2429 /* Analyze groups of accesses: check that DR belongs to a group of
2430 accesses of legal size, step, etc. Detect gaps, single element
2431 interleaving, and other special cases. Set grouped access info.
2432 Collect groups of strided stores for further use in SLP analysis.
2433 Worker for vect_analyze_group_access. */
2435 static bool
2436 vect_analyze_group_access_1 (struct data_reference *dr)
2438 tree step = DR_STEP (dr);
2439 tree scalar_type = TREE_TYPE (DR_REF (dr));
2440 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2441 gimple *stmt = DR_STMT (dr);
2442 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2443 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2444 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2445 HOST_WIDE_INT dr_step = -1;
2446 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2447 bool slp_impossible = false;
2449 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2450 size of the interleaving group (including gaps). */
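  /* For example, loads of a[4*i] and a[4*i + 2] have DR_STEP equal to
     4 * sizeof (a[0]), so GROUPSIZE is 4 even though only two elements
     of each group are actually accessed.  */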
2451 if (tree_fits_shwi_p (step))
2453 dr_step = tree_to_shwi (step);
2454 /* Check that STEP is a multiple of type size. Otherwise there is
2455 a non-element-sized gap at the end of the group which we
2456 cannot represent in GROUP_GAP or GROUP_SIZE.
2457 ??? As we can handle non-constant step fine here we should
2458 simply remove uses of GROUP_GAP between the last and first
2459 element and instead rely on DR_STEP. GROUP_SIZE then would
2460 simply not include that gap. */
2461 if ((dr_step % type_size) != 0)
2463 if (dump_enabled_p ())
2465 dump_printf_loc (MSG_NOTE, vect_location,
2466 "Step ");
2467 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2468 dump_printf (MSG_NOTE,
2469 " is not a multiple of the element size for ");
2470 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2471 dump_printf (MSG_NOTE, "\n");
2473 return false;
2475 groupsize = absu_hwi (dr_step) / type_size;
2477 else
2478 groupsize = 0;
2480 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2481 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2483 /* Check whether this DR is part of an interleaving group and is the single
2484 element of the group that is accessed in the loop. */
2486 /* Gaps are supported only for loads. STEP must be a multiple of the type
2487 size. */
2488 if (DR_IS_READ (dr)
2489 && (dr_step % type_size) == 0
2490 && groupsize > 0)
2492 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2493 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2494 GROUP_GAP (stmt_info) = groupsize - 1;
2495 if (dump_enabled_p ())
2497 dump_printf_loc (MSG_NOTE, vect_location,
2498 "Detected single element interleaving ");
2499 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2500 dump_printf (MSG_NOTE, " step ");
2501 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2502 dump_printf (MSG_NOTE, "\n");
2505 return true;
2508 if (dump_enabled_p ())
2510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2511 "not consecutive access ");
2512 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2515 if (bb_vinfo)
2517 /* Mark the statement as unvectorizable. */
2518 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2519 return true;
2522 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2523 STMT_VINFO_STRIDED_P (stmt_info) = true;
2524 return true;
2527 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2529 /* First stmt in the interleaving chain. Check the chain. */
2530 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2531 struct data_reference *data_ref = dr;
2532 unsigned int count = 1;
2533 tree prev_init = DR_INIT (data_ref);
2534 gimple *prev = stmt;
2535 HOST_WIDE_INT diff, gaps = 0;
2537 /* By construction, all group members have INTEGER_CST DR_INITs. */
2538 while (next)
2540 /* Skip identical data-refs. In case two or more stmts share a
2541 data-ref (supported only for loads), we vectorize only the first
2542 stmt, and the rest get their vectorized loads from the first
2543 one. */
2544 if (!tree_int_cst_compare (DR_INIT (data_ref),
2545 DR_INIT (STMT_VINFO_DATA_REF (
2546 vinfo_for_stmt (next)))))
2548 if (DR_IS_WRITE (data_ref))
2550 if (dump_enabled_p ())
2551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2552 "Two store stmts share the same dr.\n");
2553 return false;
2556 if (dump_enabled_p ())
2557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2558 "Two or more load stmts share the same dr.\n");
2560 /* For load use the same data-ref load. */
2561 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2563 prev = next;
2564 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2565 continue;
2568 prev = next;
2569 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2571 /* All group members have the same STEP by construction. */
2572 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2574 /* Check that the distance between two accesses is equal to the type
2575 size. Otherwise, we have gaps. */
2576 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2577 - TREE_INT_CST_LOW (prev_init)) / type_size;
2578 if (diff != 1)
2580 /* FORNOW: SLP of accesses with gaps is not supported. */
2581 slp_impossible = true;
2582 if (DR_IS_WRITE (data_ref))
2584 if (dump_enabled_p ())
2585 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2586 "interleaved store with gaps\n");
2587 return false;
2590 gaps += diff - 1;
2593 last_accessed_element += diff;
2595 /* Store the gap from the previous member of the group. If there is no
2596 gap in the access, GROUP_GAP is always 1. */
2597 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2599 prev_init = DR_INIT (data_ref);
2600 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2601 /* Count the number of data-refs in the chain. */
2602 count++;
2605 if (groupsize == 0)
2606 groupsize = count + gaps;
2608 /* This could be UINT_MAX, but as we are generating code in a very
2609 inefficient way we have to cap it earlier. See PR78699 for example. */
2610 if (groupsize > 4096)
2612 if (dump_enabled_p ())
2613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2614 "group is too large\n");
2615 return false;
2618 /* Check that the size of the interleaving is equal to count for stores,
2619 i.e., that there are no gaps. */
2620 if (groupsize != count
2621 && !DR_IS_READ (dr))
2623 if (dump_enabled_p ())
2624 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2625 "interleaved store with gaps\n");
2626 return false;
2629 /* If there is a gap after the last load in the group it is the
2630 difference between the groupsize and the last accessed
2631 element.
2632 When there is no gap, this difference should be 0. */
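  /* E.g. for a group of size 4 whose members access elements 0 and 2,
     LAST_ACCESSED_ELEMENT is 3, so the gap recorded on the first stmt is
     4 - 3 = 1 (element 3 is never accessed).  */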
2633 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2635 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2636 if (dump_enabled_p ())
2638 dump_printf_loc (MSG_NOTE, vect_location,
2639 "Detected interleaving ");
2640 if (DR_IS_READ (dr))
2641 dump_printf (MSG_NOTE, "load ");
2642 else
2643 dump_printf (MSG_NOTE, "store ");
2644 dump_printf (MSG_NOTE, "of size %u starting with ",
2645 (unsigned)groupsize);
2646 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2647 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2648 dump_printf_loc (MSG_NOTE, vect_location,
2649 "There is a gap of %u elements after the group\n",
2650 GROUP_GAP (vinfo_for_stmt (stmt)));
2653 /* SLP: create an SLP data structure for every interleaving group of
2654 stores for further analysis in vect_analyse_slp. */
2655 if (DR_IS_WRITE (dr) && !slp_impossible)
2657 if (loop_vinfo)
2658 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2659 if (bb_vinfo)
2660 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2664 return true;
2667 /* Analyze groups of accesses: check that DR belongs to a group of
2668 accesses of legal size, step, etc. Detect gaps, single element
2669 interleaving, and other special cases. Set grouped access info.
2670 Collect groups of strided stores for further use in SLP analysis. */
2672 static bool
2673 vect_analyze_group_access (struct data_reference *dr)
2675 if (!vect_analyze_group_access_1 (dr))
2677 /* Dissolve the group if present. */
2678 gimple *next;
2679 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2680 while (stmt)
2682 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2683 next = GROUP_NEXT_ELEMENT (vinfo);
2684 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2685 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2686 stmt = next;
2688 return false;
2690 return true;
2693 /* Analyze the access pattern of the data-reference DR.
2694 In case of non-consecutive accesses call vect_analyze_group_access() to
2695 analyze groups of accesses. */
2697 static bool
2698 vect_analyze_data_ref_access (struct data_reference *dr)
2700 tree step = DR_STEP (dr);
2701 tree scalar_type = TREE_TYPE (DR_REF (dr));
2702 gimple *stmt = DR_STMT (dr);
2703 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2704 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2705 struct loop *loop = NULL;
2707 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2708 return true;
2710 if (loop_vinfo)
2711 loop = LOOP_VINFO_LOOP (loop_vinfo);
2713 if (loop_vinfo && !step)
2715 if (dump_enabled_p ())
2716 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2717 "bad data-ref access in loop\n");
2718 return false;
2721 /* Allow loads with zero step in inner-loop vectorization. */
2722 if (loop_vinfo && integer_zerop (step))
2724 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2725 if (!nested_in_vect_loop_p (loop, stmt))
2726 return DR_IS_READ (dr);
2727 /* Allow references with zero step for outer loops marked
2728 with pragma omp simd only - it guarantees absence of
2729 loop-carried dependencies between inner loop iterations. */
2730 if (loop->safelen < 2)
2732 if (dump_enabled_p ())
2733 dump_printf_loc (MSG_NOTE, vect_location,
2734 "zero step in inner loop of nest\n");
2735 return false;
2739 if (loop && nested_in_vect_loop_p (loop, stmt))
2741 /* Interleaved accesses are not yet supported within outer-loop
2742 vectorization for references in the inner-loop. */
2743 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2745 /* For the rest of the analysis we use the outer-loop step. */
2746 step = STMT_VINFO_DR_STEP (stmt_info);
2747 if (integer_zerop (step))
2749 if (dump_enabled_p ())
2750 dump_printf_loc (MSG_NOTE, vect_location,
2751 "zero step in outer loop.\n");
2752 return DR_IS_READ (dr);
2756 /* Consecutive? */
2757 if (TREE_CODE (step) == INTEGER_CST)
2759 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2760 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2761 || (dr_step < 0
2762 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2764 /* Mark that it is not interleaving. */
2765 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2766 return true;
2770 if (loop && nested_in_vect_loop_p (loop, stmt))
2772 if (dump_enabled_p ())
2773 dump_printf_loc (MSG_NOTE, vect_location,
2774 "grouped access in outer loop.\n");
2775 return false;
2779 /* Assume this is a DR handled by non-constant strided load case. */
2780 if (TREE_CODE (step) != INTEGER_CST)
2781 return (STMT_VINFO_STRIDED_P (stmt_info)
2782 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2783 || vect_analyze_group_access (dr)));
2785 /* Not a consecutive access: check whether it is part of an interleaving group. */
2786 return vect_analyze_group_access (dr);
2789 /* Compare two data-references DRA and DRB, for sorting them into chunks
2790 suitable for grouping. */
2792 static int
2793 dr_group_sort_cmp (const void *dra_, const void *drb_)
2795 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2796 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2797 int cmp;
2799 /* Stabilize sort. */
2800 if (dra == drb)
2801 return 0;
2803 /* DRs in different loops never belong to the same group. */
2804 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2805 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2806 if (loopa != loopb)
2807 return loopa->num < loopb->num ? -1 : 1;
2809 /* Ordering of DRs according to base. */
2810 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2811 DR_BASE_ADDRESS (drb));
2812 if (cmp != 0)
2813 return cmp;
2815 /* And according to DR_OFFSET. */
2816 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2817 if (cmp != 0)
2818 return cmp;
2820 /* Put reads before writes. */
2821 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2822 return DR_IS_READ (dra) ? -1 : 1;
2824 /* Then sort by access size. */
2825 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2826 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2827 if (cmp != 0)
2828 return cmp;
2830 /* And then by step. */
2831 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2832 if (cmp != 0)
2833 return cmp;
2835 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2836 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2837 if (cmp == 0)
2838 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2839 return cmp;
2842 /* If OP is the result of a conversion, return the unconverted value,
2843 otherwise return null. */
2845 static tree
2846 strip_conversion (tree op)
2848 if (TREE_CODE (op) != SSA_NAME)
2849 return NULL_TREE;
2850 gimple *stmt = SSA_NAME_DEF_STMT (op);
2851 if (!is_gimple_assign (stmt)
2852 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2853 return NULL_TREE;
2854 return gimple_assign_rhs1 (stmt);
2857 /* Return true if vectorizable_* routines can handle statements STMT1
2858 and STMT2 being in a single group. */
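/* For example, two IFN_MASK_LOAD calls whose masks are the same SSA name,
   possibly hidden behind a conversion such as the ones created by
   build_mask_conversion, can be grouped; a masked access never groups with
   an unmasked one, nor a masked load with a masked store.  */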
2860 static bool
2861 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2863 if (gimple_assign_single_p (stmt1))
2864 return gimple_assign_single_p (stmt2);
2866 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2868 /* Check for two masked loads or two masked stores. */
2869 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2870 return false;
2871 internal_fn ifn = gimple_call_internal_fn (stmt1);
2872 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2873 return false;
2874 if (ifn != gimple_call_internal_fn (stmt2))
2875 return false;
2877 /* Check that the masks are the same. Cope with casts of masks,
2878 like those created by build_mask_conversion. */
2879 tree mask1 = gimple_call_arg (stmt1, 2);
2880 tree mask2 = gimple_call_arg (stmt2, 2);
2881 if (!operand_equal_p (mask1, mask2, 0))
2883 mask1 = strip_conversion (mask1);
2884 if (!mask1)
2885 return false;
2886 mask2 = strip_conversion (mask2);
2887 if (!mask2)
2888 return false;
2889 if (!operand_equal_p (mask1, mask2, 0))
2890 return false;
2892 return true;
2895 return false;
2898 /* Function vect_analyze_data_ref_accesses.
2900 Analyze the access pattern of all the data references in the loop.
2902 FORNOW: the only access pattern that is considered vectorizable is a
2903 simple step 1 (consecutive) access.
2905 FORNOW: handle only arrays and pointer accesses. */
2907 bool
2908 vect_analyze_data_ref_accesses (vec_info *vinfo)
2910 unsigned int i;
2911 vec<data_reference_p> datarefs = vinfo->datarefs;
2912 struct data_reference *dr;
2914 if (dump_enabled_p ())
2915 dump_printf_loc (MSG_NOTE, vect_location,
2916 "=== vect_analyze_data_ref_accesses ===\n");
2918 if (datarefs.is_empty ())
2919 return true;
2921 /* Sort the array of datarefs to make building the interleaving chains
2922 linear. Don't modify the original vector's order; it is needed for
2923 determining which dependencies are reversed. */
2924 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2925 datarefs_copy.qsort (dr_group_sort_cmp);
2927 /* Build the interleaving chains. */
2928 for (i = 0; i < datarefs_copy.length () - 1;)
2930 data_reference_p dra = datarefs_copy[i];
2931 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2932 stmt_vec_info lastinfo = NULL;
2933 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2934 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2936 ++i;
2937 continue;
2939 for (i = i + 1; i < datarefs_copy.length (); ++i)
2941 data_reference_p drb = datarefs_copy[i];
2942 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2943 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2944 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2945 break;
2947 /* ??? Imperfect sorting (non-compatible types, non-modulo
2948 accesses, same accesses) can lead to a group being artificially
2949 split here, as we don't just skip over those. If it really
2950 matters we can push those to a worklist and re-iterate
2951 over them. Then we can just skip ahead to the next DR here. */
2953 /* DRs in a different loop should not be put into the same
2954 interleaving group. */
2955 if (gimple_bb (DR_STMT (dra))->loop_father
2956 != gimple_bb (DR_STMT (drb))->loop_father)
2957 break;
2959 /* Check that the data-refs have the same first location (except init),
2960 that they are both either stores or loads (not a load and a store),
2961 and that masked accesses are only grouped with matching masked accesses. */
2962 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2963 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2964 DR_BASE_ADDRESS (drb)) != 0
2965 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2966 || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
2967 break;
2969 /* Check that the data-refs have the same constant size. */
2970 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2971 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2972 if (!tree_fits_uhwi_p (sza)
2973 || !tree_fits_uhwi_p (szb)
2974 || !tree_int_cst_equal (sza, szb))
2975 break;
2977 /* Check that the data-refs have the same step. */
2978 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2979 break;
2981 /* Check the types are compatible.
2982 ??? We don't distinguish this during sorting. */
2983 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2984 TREE_TYPE (DR_REF (drb))))
2985 break;
2987 /* Check that the DR_INITs are compile-time constants. */
2988 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2989 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2990 break;
2992 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2993 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2994 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2995 HOST_WIDE_INT init_prev
2996 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2997 gcc_assert (init_a <= init_b
2998 && init_a <= init_prev
2999 && init_prev <= init_b);
3001 /* Do not place the same access in the interleaving chain twice. */
3002 if (init_b == init_prev)
3004 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3005 < gimple_uid (DR_STMT (drb)));
3006 /* ??? For now we simply "drop" the later reference which is
3007 otherwise the same, rather than finishing off this group.
3008 In the end we'd want to re-process duplicates, forming
3009 multiple groups from the refs, likely by just collecting
3010 all candidates (including duplicates and split points
3011 below) in a vector and then processing them together. */
3012 continue;
3015 /* If init_b == init_a + the size of the type * k, we have an
3016 interleaving, and DRA is accessed before DRB. */
3017 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3018 if (type_size_a == 0
3019 || (init_b - init_a) % type_size_a != 0)
3020 break;
3022 /* For stores, require that the accesses be adjacent. This splits
3023 groups into chunks we support (we don't support vectorization
3024 of stores with gaps). */
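          /* For example, stores to a[4*i] and a[4*i + 2] leave element 1
             unwritten, so the check below ends the group at the first store
             rather than forming a store group with a gap.  */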
3025 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3026 break;
3028 /* If the step (when constant and nonzero) is no greater than the
3029 difference between the data-refs' inits, the second ref lies beyond
3030 one full step of the first; this splits groups into suitable sizes. */
3031 if (tree_fits_shwi_p (DR_STEP (dra)))
3033 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3034 if (step != 0 && step <= (init_b - init_a))
3035 break;
3038 if (dump_enabled_p ())
3040 dump_printf_loc (MSG_NOTE, vect_location,
3041 "Detected interleaving ");
3042 if (DR_IS_READ (dra))
3043 dump_printf (MSG_NOTE, "load ");
3044 else
3045 dump_printf (MSG_NOTE, "store ");
3046 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3047 dump_printf (MSG_NOTE, " and ");
3048 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3049 dump_printf (MSG_NOTE, "\n");
3052 /* Link the found element into the group list. */
3053 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
3055 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
3056 lastinfo = stmtinfo_a;
3058 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
3059 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
3060 lastinfo = stmtinfo_b;
3064 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3065 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
3066 && !vect_analyze_data_ref_access (dr))
3068 if (dump_enabled_p ())
3069 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3070 "not vectorized: complicated access pattern.\n");
3072 if (is_a <bb_vec_info> (vinfo))
3074 /* Mark the statement as not vectorizable. */
3075 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3076 continue;
3078 else
3080 datarefs_copy.release ();
3081 return false;
3085 datarefs_copy.release ();
3086 return true;
3089 /* Function vect_vfa_segment_size.
3091 Input:
3092 DR: The data reference.
3093 LENGTH_FACTOR: segment length to consider.
3095 Return a value suitable for the dr_with_seg_len::seg_len field.
3096 This is the "distance travelled" by the pointer from the first
3097 iteration in the segment to the last. Note that it does not include
3098 the size of the access; in effect it only describes the first byte. */
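/* For example, a data reference with DR_STEP 4 and a LENGTH_FACTOR of 8
   iterations gets a segment length of 4 * (8 - 1) = 28 bytes.  */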
3100 static tree
3101 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3103 length_factor = size_binop (MINUS_EXPR,
3104 fold_convert (sizetype, length_factor),
3105 size_one_node);
3106 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3107 length_factor);
3110 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3111 gives the worst-case number of bytes covered by the segment. */
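/* For example, a group of 4 accesses of 4 bytes each with GROUP_GAP 1 gives
   4 * (4 - 1) = 12 bytes; the dr_explicit_realign_optimized case below
   additionally accounts for touching a full vector.  */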
3113 static unsigned HOST_WIDE_INT
3114 vect_vfa_access_size (data_reference *dr)
3116 stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3117 tree ref_type = TREE_TYPE (DR_REF (dr));
3118 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3119 unsigned HOST_WIDE_INT access_size = ref_size;
3120 if (GROUP_FIRST_ELEMENT (stmt_vinfo))
3122 gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
3123 access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo);
3125 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3126 && (vect_supportable_dr_alignment (dr, false)
3127 == dr_explicit_realign_optimized))
3129 /* We might access a full vector's worth. */
3130 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3131 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3133 return access_size;
3136 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3138 static unsigned int
3139 vect_vfa_align (const data_reference *dr)
3141 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3144 /* Function vect_compile_time_alias.
3146 Given data references A and B with equal base and offset, see whether
3147 the alias relation can be decided at compilation time. Return 1 if
3148 it can and the references alias, 0 if it can and the references do
3149 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3150 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3151 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3153 static int
3154 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3155 tree segment_length_a, tree segment_length_b,
3156 unsigned HOST_WIDE_INT access_size_a,
3157 unsigned HOST_WIDE_INT access_size_b)
3159 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3160 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3161 poly_uint64 const_length_a;
3162 poly_uint64 const_length_b;
3164 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3165 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3166 [a, a+12) */
3167 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3169 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3170 offset_a = (offset_a + access_size_a) - const_length_a;
3172 else
3173 const_length_a = tree_to_poly_uint64 (segment_length_a);
3174 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3176 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3177 offset_b = (offset_b + access_size_b) - const_length_b;
3179 else
3180 const_length_b = tree_to_poly_uint64 (segment_length_b);
3182 const_length_a += access_size_a;
3183 const_length_b += access_size_b;
3185 if (ranges_known_overlap_p (offset_a, const_length_a,
3186 offset_b, const_length_b))
3187 return 1;
3189 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3190 offset_b, const_length_b))
3191 return 0;
3193 return -1;
3196 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3197 in DDR is >= VF. */
3199 static bool
3200 dependence_distance_ge_vf (data_dependence_relation *ddr,
3201 unsigned int loop_depth, poly_uint64 vf)
3203 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3204 || DDR_NUM_DIST_VECTS (ddr) == 0)
3205 return false;
3207 /* If the dependence is exact, we should have limited the VF instead. */
3208 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3210 unsigned int i;
3211 lambda_vector dist_v;
3212 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3214 HOST_WIDE_INT dist = dist_v[loop_depth];
3215 if (dist != 0
3216 && !(dist > 0 && DDR_REVERSED_P (ddr))
3217 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3218 return false;
3221 if (dump_enabled_p ())
3223 dump_printf_loc (MSG_NOTE, vect_location,
3224 "dependence distance between ");
3225 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3226 dump_printf (MSG_NOTE, " and ");
3227 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3228 dump_printf (MSG_NOTE, " is >= VF\n");
3231 return true;
3234 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3236 static void
3237 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3239 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3240 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3241 dump_printf (dump_kind, ") >= ");
3242 dump_dec (dump_kind, lower_bound.min_value);
3245 /* Record that the vectorized loop requires the vec_lower_bound described
3246 by EXPR, UNSIGNED_P and MIN_VALUE. */
3248 static void
3249 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3250 poly_uint64 min_value)
3252 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3253 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3254 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3256 unsigned_p &= lower_bounds[i].unsigned_p;
3257 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3258 if (lower_bounds[i].unsigned_p != unsigned_p
3259 || maybe_lt (lower_bounds[i].min_value, min_value))
3261 lower_bounds[i].unsigned_p = unsigned_p;
3262 lower_bounds[i].min_value = min_value;
3263 if (dump_enabled_p ())
3265 dump_printf_loc (MSG_NOTE, vect_location,
3266 "updating run-time check to ");
3267 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3268 dump_printf (MSG_NOTE, "\n");
3271 return;
3274 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3275 if (dump_enabled_p ())
3277 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3278 dump_lower_bound (MSG_NOTE, lower_bound);
3279 dump_printf (MSG_NOTE, "\n");
3281 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3284 /* Return true if it's unlikely that the step of the vectorized form of DR
3285 will span fewer than GAP bytes. */
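/* For example, with an estimated vectorization factor of 4, a group size of 2
   and 4-byte scalar accesses, any gap of at most 4 * 2 * 4 = 32 bytes is
   considered small.  */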
3287 static bool
3288 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3290 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3291 HOST_WIDE_INT count
3292 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3293 if (GROUP_FIRST_ELEMENT (stmt_info))
3294 count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
3295 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3298 /* Return true if we know that there is no alias between DR_A and DR_B
3299 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3300 *LOWER_BOUND_OUT to this N. */
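/* For example, if DR_A accesses bytes [0, 4) and DR_B bytes [8, 12) of the
   same base in each scalar iteration, the bound stored in *LOWER_BOUND_OUT is
   8 + 4 - 0 = 12; there is no alias whenever abs (DR_STEP) >= 12.  */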
3302 static bool
3303 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3304 poly_uint64 *lower_bound_out)
3306 /* Check that there is a constant gap of known sign between DR_A
3307 and DR_B. */
3308 poly_int64 init_a, init_b;
3309 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3310 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3311 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3312 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3313 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3314 || !ordered_p (init_a, init_b))
3315 return false;
3317 /* Sort DR_A and DR_B by the address they access. */
3318 if (maybe_lt (init_b, init_a))
3320 std::swap (init_a, init_b);
3321 std::swap (dr_a, dr_b);
3324 /* If the two accesses could be dependent within a scalar iteration,
3325 make sure that we'd retain their order. */
3326 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3327 && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
3328 return false;
3330 /* There is no alias if abs (DR_STEP) is greater than or equal to
3331 the bytes spanned by the combination of the two accesses. */
3332 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3333 return true;
3336 /* Function vect_prune_runtime_alias_test_list.
3338 Prune a list of ddrs to be tested at run-time by versioning for alias.
3339 Merge several alias checks into one if possible.
3340 Return FALSE if the resulting list of ddrs is longer than allowed by
3341 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3343 bool
3344 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3346 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3347 hash_set <tree_pair_hash> compared_objects;
3349 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3350 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3351 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3352 vec<vec_object_pair> &check_unequal_addrs
3353 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3354 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3355 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3357 ddr_p ddr;
3358 unsigned int i;
3359 tree length_factor;
3361 if (dump_enabled_p ())
3362 dump_printf_loc (MSG_NOTE, vect_location,
3363 "=== vect_prune_runtime_alias_test_list ===\n");
3365 /* Step values are irrelevant for aliasing if the number of vector
3366 iterations is equal to the number of scalar iterations (which can
3367 happen for fully-SLP loops). */
3368 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3370 if (!ignore_step_p)
3372 /* Convert the checks for nonzero steps into bound tests. */
3373 tree value;
3374 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3375 vect_check_lower_bound (loop_vinfo, value, true, 1);
3378 if (may_alias_ddrs.is_empty ())
3379 return true;
3381 comp_alias_ddrs.create (may_alias_ddrs.length ());
3383 unsigned int loop_depth
3384 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3385 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3387 /* First, we collect all data ref pairs for aliasing checks. */
3388 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3390 int comp_res;
3391 poly_uint64 lower_bound;
3392 struct data_reference *dr_a, *dr_b;
3393 gimple *dr_group_first_a, *dr_group_first_b;
3394 tree segment_length_a, segment_length_b;
3395 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3396 unsigned int align_a, align_b;
3397 gimple *stmt_a, *stmt_b;
3399 /* Ignore the alias if the VF we chose ended up being no greater
3400 than the dependence distance. */
3401 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3402 continue;
3404 if (DDR_OBJECT_A (ddr))
3406 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3407 if (!compared_objects.add (new_pair))
3409 if (dump_enabled_p ())
3411 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3412 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3413 dump_printf (MSG_NOTE, " and ");
3414 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3415 dump_printf (MSG_NOTE, " have different addresses\n");
3417 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3419 continue;
3422 dr_a = DDR_A (ddr);
3423 stmt_a = DR_STMT (DDR_A (ddr));
3425 dr_b = DDR_B (ddr);
3426 stmt_b = DR_STMT (DDR_B (ddr));
3428 /* Skip the pair if inter-iteration dependencies are irrelevant
3429 and intra-iteration dependencies are guaranteed to be honored. */
3430 if (ignore_step_p
3431 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3432 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3434 if (dump_enabled_p ())
3436 dump_printf_loc (MSG_NOTE, vect_location,
3437 "no need for alias check between ");
3438 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3439 dump_printf (MSG_NOTE, " and ");
3440 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3441 dump_printf (MSG_NOTE, " when VF is 1\n");
3443 continue;
3446 /* See whether we can handle the alias using a bounds check on
3447 the step, and whether that's likely to be the best approach.
3448 (It might not be, for example, if the minimum step is much larger
3449 than the number of bytes handled by one vector iteration.) */
3450 if (!ignore_step_p
3451 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3452 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3453 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3454 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3456 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3457 if (dump_enabled_p ())
3459 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3460 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3461 dump_printf (MSG_NOTE, " and ");
3462 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3463 dump_printf (MSG_NOTE, " when the step ");
3464 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3465 dump_printf (MSG_NOTE, " is outside ");
3466 if (unsigned_p)
3467 dump_printf (MSG_NOTE, "[0");
3468 else
3470 dump_printf (MSG_NOTE, "(");
3471 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3473 dump_printf (MSG_NOTE, ", ");
3474 dump_dec (MSG_NOTE, lower_bound);
3475 dump_printf (MSG_NOTE, ")\n");
3477 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3478 lower_bound);
3479 continue;
3482 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3483 if (dr_group_first_a)
3485 stmt_a = dr_group_first_a;
3486 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3489 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3490 if (dr_group_first_b)
3492 stmt_b = dr_group_first_b;
3493 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3496 if (ignore_step_p)
3498 segment_length_a = size_zero_node;
3499 segment_length_b = size_zero_node;
3501 else
3503 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3504 length_factor = scalar_loop_iters;
3505 else
3506 length_factor = size_int (vect_factor);
3507 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3508 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3510 access_size_a = vect_vfa_access_size (dr_a);
3511 access_size_b = vect_vfa_access_size (dr_b);
3512 align_a = vect_vfa_align (dr_a);
3513 align_b = vect_vfa_align (dr_b);
3515 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3516 DR_BASE_ADDRESS (dr_b));
3517 if (comp_res == 0)
3518 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3519 DR_OFFSET (dr_b));
3521 /* See whether the alias is known at compilation time. */
3522 if (comp_res == 0
3523 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3524 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3525 && poly_int_tree_p (segment_length_a)
3526 && poly_int_tree_p (segment_length_b))
3528 int res = vect_compile_time_alias (dr_a, dr_b,
3529 segment_length_a,
3530 segment_length_b,
3531 access_size_a,
3532 access_size_b);
3533 if (res >= 0 && dump_enabled_p ())
3535 dump_printf_loc (MSG_NOTE, vect_location,
3536 "can tell at compile time that ");
3537 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3538 dump_printf (MSG_NOTE, " and ");
3539 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3540 if (res == 0)
3541 dump_printf (MSG_NOTE, " do not alias\n");
3542 else
3543 dump_printf (MSG_NOTE, " alias\n");
3546 if (res == 0)
3547 continue;
3549 if (res == 1)
3551 if (dump_enabled_p ())
3552 dump_printf_loc (MSG_NOTE, vect_location,
3553 "not vectorized: compilation time alias.\n");
3554 return false;
3558 dr_with_seg_len_pair_t dr_with_seg_len_pair
3559 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3560 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3562 /* Canonicalize pairs by sorting the two DR members. */
3563 if (comp_res > 0)
3564 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3566 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3569 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3571 unsigned int count = (comp_alias_ddrs.length ()
3572 + check_unequal_addrs.length ());
3574 dump_printf_loc (MSG_NOTE, vect_location,
3575 "improved number of alias checks from %d to %d\n",
3576 may_alias_ddrs.length (), count);
3577 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3579 if (dump_enabled_p ())
3580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3581 "number of versioning for alias "
3582 "run-time tests exceeds %d "
3583 "(--param vect-max-version-for-alias-checks)\n",
3584 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3585 return false;
3588 return true;
3591 /* Check whether we can use an internal function for a gather load
3592 or scatter store. READ_P is true for loads and false for stores.
3593 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3594 the type of the memory elements being loaded or stored. OFFSET_BITS
3595 is the number of bits in each scalar offset and OFFSET_SIGN is the
3596 sign of the offset. SCALE is the amount by which the offset should
3597 be multiplied *after* it has been converted to address width.
3599 Return true if the function is supported, storing the function
3600 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3602 bool
3603 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3604 tree memory_type, unsigned int offset_bits,
3605 signop offset_sign, int scale,
3606 internal_fn *ifn_out, tree *element_type_out)
3608 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3609 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3610 if (offset_bits > element_bits)
3611 /* Internal functions require the offset to be the same width as
3612 the vector elements. We can extend narrower offsets, but it isn't
3613 safe to truncate wider offsets. */
3614 return false;
3616 if (element_bits != memory_bits)
3617 /* For now the vector elements must be the same width as the
3618 memory elements. */
3619 return false;
3621 /* Work out which function we need. */
3622 internal_fn ifn;
3623 if (read_p)
3624 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3625 else
3626 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3628 /* Test whether the target supports this combination. */
3629 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3630 offset_sign, scale))
3631 return false;
3633 *ifn_out = ifn;
3634 *element_type_out = TREE_TYPE (vectype);
3635 return true;
3638 /* CALL is a call to an internal gather load or scatter store function.
3639 Describe the operation in INFO. */
3641 static void
3642 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3644 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3645 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3646 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3648 info->ifn = gimple_call_internal_fn (call);
3649 info->decl = NULL_TREE;
3650 info->base = gimple_call_arg (call, 0);
3651 info->offset = gimple_call_arg (call, 1);
3652 info->offset_dt = vect_unknown_def_type;
3653 info->offset_vectype = NULL_TREE;
3654 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3655 info->element_type = TREE_TYPE (vectype);
3656 info->memory_type = TREE_TYPE (DR_REF (dr));
3659 /* Return true if a non-affine read or write in STMT is suitable for a
3660 gather load or scatter store. Describe the operation in *INFO if so. */
3662 bool
3663 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3664 gather_scatter_info *info)
3666 HOST_WIDE_INT scale = 1;
3667 poly_int64 pbitpos, pbitsize;
3668 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3669 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3670 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3671 tree offtype = NULL_TREE;
3672 tree decl = NULL_TREE, base, off;
3673 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3674 tree memory_type = TREE_TYPE (DR_REF (dr));
3675 machine_mode pmode;
3676 int punsignedp, reversep, pvolatilep = 0;
3677 internal_fn ifn;
3678 tree element_type;
3679 bool masked_p = false;
3681 /* See whether this is already a call to a gather/scatter internal function.
3682 If not, see whether it's a masked load or store. */
3683 gcall *call = dyn_cast <gcall *> (stmt);
3684 if (call && gimple_call_internal_p (call))
3686 ifn = gimple_call_internal_fn (stmt);
3687 if (internal_gather_scatter_fn_p (ifn))
3689 vect_describe_gather_scatter_call (call, info);
3690 return true;
3692 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3695 /* True if we should aim to use internal functions rather than
3696 built-in functions. */
3697 bool use_ifn_p = (DR_IS_READ (dr)
3698 ? supports_vec_gather_load_p ()
3699 : supports_vec_scatter_store_p ());
3701 base = DR_REF (dr);
3702 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3703 see if we can use the def stmt of the address. */
3704 if (masked_p
3705 && TREE_CODE (base) == MEM_REF
3706 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3707 && integer_zerop (TREE_OPERAND (base, 1))
3708 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3710 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3711 if (is_gimple_assign (def_stmt)
3712 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3713 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3716 /* The gather and scatter builtins need an address of the form
3717 loop_invariant + vector * {1, 2, 4, 8}
3718 or
3719 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3720 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3721 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3722 multiplications and additions in it. To get a vector, we need
3723 a single SSA_NAME that will be defined in the loop and will
3724 contain everything that is not loop invariant and that can be
3725 vectorized. The following code attempts to find such a preexisting
3726 SSA_NAME OFF and put the loop invariants into a tree BASE
3727 that can be gimplified before the loop. */
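/* Illustrative example (hypothetical names): for a gather such as

     for (i = 0; i < n; i++)
       sum += data[idx[i]];

   the loop-invariant address of data[0] ends up in BASE, the
   loop-varying idx[i] is what remains as OFF, and the element size
   sizeof (data[0]) can become the scale applied to it.  */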
3728 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3729 &punsignedp, &reversep, &pvolatilep);
3730 gcc_assert (base && !reversep);
3731 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3733 if (TREE_CODE (base) == MEM_REF)
3735 if (!integer_zerop (TREE_OPERAND (base, 1)))
3737 if (off == NULL_TREE)
3738 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3739 else
3740 off = size_binop (PLUS_EXPR, off,
3741 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3743 base = TREE_OPERAND (base, 0);
3745 else
3746 base = build_fold_addr_expr (base);
3748 if (off == NULL_TREE)
3749 off = size_zero_node;
3751 /* If base is not loop invariant, then: if off is 0, start with just
3752 the constant offset in the loop invariant BASE and continue with base
3753 as OFF; otherwise give up.
3754 We could handle that case by gimplifying the addition of base + off
3755 into some SSA_NAME and using that as off, but for now punt. */
3756 if (!expr_invariant_in_loop_p (loop, base))
3758 if (!integer_zerop (off))
3759 return false;
3760 off = base;
3761 base = size_int (pbytepos);
3763 /* Otherwise put base + constant offset into the loop invariant BASE
3764 and continue with OFF. */
3765 else
3767 base = fold_convert (sizetype, base);
3768 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3771 /* OFF at this point may be either a SSA_NAME or some tree expression
3772 from get_inner_reference. Try to peel off loop invariants from it
3773 into BASE as long as possible. */
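/* Illustrative example (hypothetical values): if OFF is
   (sizetype) i_1 * 4 + 16, the loop-invariant 16 is folded into BASE
   by the PLUS_EXPR case below, the multiplication by 4 can become the
   SCALE when the target supports that scaling (MULT_EXPR case), and
   i_1 is what finally remains as OFF.  */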
3774 STRIP_NOPS (off);
3775 while (offtype == NULL_TREE)
3777 enum tree_code code;
3778 tree op0, op1, add = NULL_TREE;
3780 if (TREE_CODE (off) == SSA_NAME)
3782 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3784 if (expr_invariant_in_loop_p (loop, off))
3785 return false;
3787 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3788 break;
3790 op0 = gimple_assign_rhs1 (def_stmt);
3791 code = gimple_assign_rhs_code (def_stmt);
3792 op1 = gimple_assign_rhs2 (def_stmt);
3794 else
3796 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3797 return false;
3798 code = TREE_CODE (off);
3799 extract_ops_from_tree (off, &code, &op0, &op1);
3801 switch (code)
3803 case POINTER_PLUS_EXPR:
3804 case PLUS_EXPR:
3805 if (expr_invariant_in_loop_p (loop, op0))
3807 add = op0;
3808 off = op1;
3809 do_add:
3810 add = fold_convert (sizetype, add);
3811 if (scale != 1)
3812 add = size_binop (MULT_EXPR, add, size_int (scale));
3813 base = size_binop (PLUS_EXPR, base, add);
3814 continue;
3816 if (expr_invariant_in_loop_p (loop, op1))
3818 add = op1;
3819 off = op0;
3820 goto do_add;
3822 break;
3823 case MINUS_EXPR:
3824 if (expr_invariant_in_loop_p (loop, op1))
3826 add = fold_convert (sizetype, op1);
3827 add = size_binop (MINUS_EXPR, size_zero_node, add);
3828 off = op0;
3829 goto do_add;
3831 break;
3832 case MULT_EXPR:
3833 if (scale == 1 && tree_fits_shwi_p (op1))
3835 int new_scale = tree_to_shwi (op1);
3836 /* Only treat this as a scaling operation if the target
3837 supports it. */
3838 if (use_ifn_p
3839 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3840 vectype, memory_type, 1,
3841 TYPE_SIGN (TREE_TYPE (op0)),
3842 new_scale, &ifn,
3843 &element_type))
3844 break;
3845 scale = new_scale;
3846 off = op0;
3847 continue;
3849 break;
3850 case SSA_NAME:
3851 off = op0;
3852 continue;
3853 CASE_CONVERT:
3854 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3855 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3856 break;
3857 if (TYPE_PRECISION (TREE_TYPE (op0))
3858 == TYPE_PRECISION (TREE_TYPE (off)))
3860 off = op0;
3861 continue;
3864 /* The internal functions need the offset to be the same width
3865 as the elements of VECTYPE. Don't include operations that
3866 cast the offset from that width to a different width. */
3867 if (use_ifn_p
3868 && (int_size_in_bytes (TREE_TYPE (vectype))
3869 == int_size_in_bytes (TREE_TYPE (off))))
3870 break;
3872 if (TYPE_PRECISION (TREE_TYPE (op0))
3873 < TYPE_PRECISION (TREE_TYPE (off)))
3875 off = op0;
3876 offtype = TREE_TYPE (off);
3877 STRIP_NOPS (off);
3878 continue;
3880 break;
3881 default:
3882 break;
3884 break;
3887 /* If at the end OFF still isn't a SSA_NAME or isn't
3888 defined in the loop, punt. */
3889 if (TREE_CODE (off) != SSA_NAME
3890 || expr_invariant_in_loop_p (loop, off))
3891 return false;
3893 if (offtype == NULL_TREE)
3894 offtype = TREE_TYPE (off);
3896 if (use_ifn_p)
3898 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3899 memory_type, TYPE_PRECISION (offtype),
3900 TYPE_SIGN (offtype), scale, &ifn,
3901 &element_type))
3902 return false;
3904 else
3906 if (DR_IS_READ (dr))
3908 if (targetm.vectorize.builtin_gather)
3909 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3911 else
3913 if (targetm.vectorize.builtin_scatter)
3914 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3917 if (!decl)
3918 return false;
3920 ifn = IFN_LAST;
3921 element_type = TREE_TYPE (vectype);
3924 info->ifn = ifn;
3925 info->decl = decl;
3926 info->base = base;
3927 info->offset = off;
3928 info->offset_dt = vect_unknown_def_type;
3929 info->offset_vectype = NULL_TREE;
3930 info->scale = scale;
3931 info->element_type = element_type;
3932 info->memory_type = memory_type;
3933 return true;
3936 /* Function vect_analyze_data_refs.
3938 Find all the data references in the loop or basic block.
3940 The general structure of the analysis of data refs in the vectorizer is as
3941 follows:
3942 1- vect_analyze_data_refs(loop/bb): call
3943 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3944 in the loop/bb and their dependences.
3945 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3946 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3947 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
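   For illustration, in a loop like

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   step 1 records three data references (the loads of b[i] and c[i] and
   the store to a[i]); the later steps then test their dependences,
   alignment and access pattern.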
3951 bool
3952 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3954 struct loop *loop = NULL;
3955 unsigned int i;
3956 struct data_reference *dr;
3957 tree scalar_type;
3959 if (dump_enabled_p ())
3960 dump_printf_loc (MSG_NOTE, vect_location,
3961 "=== vect_analyze_data_refs ===\n");
3963 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3964 loop = LOOP_VINFO_LOOP (loop_vinfo);
3966 /* Go through the data-refs, check that the analysis succeeded. Update
3967 pointer from stmt_vec_info struct to DR and vectype. */
3969 vec<data_reference_p> datarefs = vinfo->datarefs;
3970 FOR_EACH_VEC_ELT (datarefs, i, dr)
3972 gimple *stmt;
3973 stmt_vec_info stmt_info;
3974 tree base, offset, init;
3975 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3976 bool simd_lane_access = false;
3977 poly_uint64 vf;
3979 again:
3980 if (!dr || !DR_REF (dr))
3982 if (dump_enabled_p ())
3983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3984 "not vectorized: unhandled data-ref\n");
3985 return false;
3988 stmt = DR_STMT (dr);
3989 stmt_info = vinfo_for_stmt (stmt);
3991 /* Discard clobbers from the dataref vector. We will remove
3992 clobber stmts during vectorization. */
3993 if (gimple_clobber_p (stmt))
3995 free_data_ref (dr);
3996 if (i == datarefs.length () - 1)
3998 datarefs.pop ();
3999 break;
4001 datarefs.ordered_remove (i);
4002 dr = datarefs[i];
4003 goto again;
4006 /* Check that analysis of the data-ref succeeded. */
4007 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4008 || !DR_STEP (dr))
4010 bool maybe_gather
4011 = DR_IS_READ (dr)
4012 && !TREE_THIS_VOLATILE (DR_REF (dr))
4013 && (targetm.vectorize.builtin_gather != NULL
4014 || supports_vec_gather_load_p ());
4015 bool maybe_scatter
4016 = DR_IS_WRITE (dr)
4017 && !TREE_THIS_VOLATILE (DR_REF (dr))
4018 && (targetm.vectorize.builtin_scatter != NULL
4019 || supports_vec_scatter_store_p ());
4020 bool maybe_simd_lane_access
4021 = is_a <loop_vec_info> (vinfo) && loop->simduid;
4023 /* If target supports vector gather loads or scatter stores, or if
4024 this might be a SIMD lane access, see whether they can be used. */
4025 if (is_a <loop_vec_info> (vinfo)
4026 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4027 && !nested_in_vect_loop_p (loop, stmt))
4029 struct data_reference *newdr
4030 = create_data_ref (NULL, loop_containing_stmt (stmt),
4031 DR_REF (dr), stmt, !maybe_scatter,
4032 DR_IS_CONDITIONAL_IN_STMT (dr));
4033 gcc_assert (newdr != NULL && DR_REF (newdr));
4034 if (DR_BASE_ADDRESS (newdr)
4035 && DR_OFFSET (newdr)
4036 && DR_INIT (newdr)
4037 && DR_STEP (newdr)
4038 && integer_zerop (DR_STEP (newdr)))
4040 if (maybe_simd_lane_access)
4042 tree off = DR_OFFSET (newdr);
4043 STRIP_NOPS (off);
4044 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4045 && TREE_CODE (off) == MULT_EXPR
4046 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4048 tree step = TREE_OPERAND (off, 1);
4049 off = TREE_OPERAND (off, 0);
4050 STRIP_NOPS (off);
4051 if (CONVERT_EXPR_P (off)
4052 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4053 0)))
4054 < TYPE_PRECISION (TREE_TYPE (off)))
4055 off = TREE_OPERAND (off, 0);
4056 if (TREE_CODE (off) == SSA_NAME)
4058 gimple *def = SSA_NAME_DEF_STMT (off);
4059 tree reft = TREE_TYPE (DR_REF (newdr));
4060 if (is_gimple_call (def)
4061 && gimple_call_internal_p (def)
4062 && (gimple_call_internal_fn (def)
4063 == IFN_GOMP_SIMD_LANE))
4065 tree arg = gimple_call_arg (def, 0);
4066 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4067 arg = SSA_NAME_VAR (arg);
4068 if (arg == loop->simduid
4069 /* For now. */
4070 && tree_int_cst_equal
4071 (TYPE_SIZE_UNIT (reft),
4072 step))
4074 DR_OFFSET (newdr) = ssize_int (0);
4075 DR_STEP (newdr) = step;
4076 DR_OFFSET_ALIGNMENT (newdr)
4077 = BIGGEST_ALIGNMENT;
4078 DR_STEP_ALIGNMENT (newdr)
4079 = highest_pow2_factor (step);
4080 dr = newdr;
4081 simd_lane_access = true;
4087 if (!simd_lane_access && (maybe_gather || maybe_scatter))
4089 dr = newdr;
4090 if (maybe_gather)
4091 gatherscatter = GATHER;
4092 else
4093 gatherscatter = SCATTER;
4096 if (gatherscatter == SG_NONE && !simd_lane_access)
4097 free_data_ref (newdr);
4100 if (gatherscatter == SG_NONE && !simd_lane_access)
4102 if (dump_enabled_p ())
4104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4105 "not vectorized: data ref analysis "
4106 "failed ");
4107 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4110 if (is_a <bb_vec_info> (vinfo))
4111 break;
4113 return false;
4117 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4119 if (dump_enabled_p ())
4120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4121 "not vectorized: base addr of dr is a "
4122 "constant\n");
4124 if (is_a <bb_vec_info> (vinfo))
4125 break;
4127 if (gatherscatter != SG_NONE || simd_lane_access)
4128 free_data_ref (dr);
4129 return false;
4132 if (TREE_THIS_VOLATILE (DR_REF (dr)))
4134 if (dump_enabled_p ())
4136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4137 "not vectorized: volatile type ");
4138 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4141 if (is_a <bb_vec_info> (vinfo))
4142 break;
4144 return false;
4147 if (stmt_can_throw_internal (stmt))
4149 if (dump_enabled_p ())
4151 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4152 "not vectorized: statement can throw an "
4153 "exception ");
4154 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4157 if (is_a <bb_vec_info> (vinfo))
4158 break;
4160 if (gatherscatter != SG_NONE || simd_lane_access)
4161 free_data_ref (dr);
4162 return false;
4165 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4166 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4168 if (dump_enabled_p ())
4170 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4171 "not vectorized: statement is bitfield "
4172 "access ");
4173 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4176 if (is_a <bb_vec_info> (vinfo))
4177 break;
4179 if (gatherscatter != SG_NONE || simd_lane_access)
4180 free_data_ref (dr);
4181 return false;
4184 base = unshare_expr (DR_BASE_ADDRESS (dr));
4185 offset = unshare_expr (DR_OFFSET (dr));
4186 init = unshare_expr (DR_INIT (dr));
4188 if (is_gimple_call (stmt)
4189 && (!gimple_call_internal_p (stmt)
4190 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
4191 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
4193 if (dump_enabled_p ())
4195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4196 "not vectorized: dr in a call ");
4197 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4200 if (is_a <bb_vec_info> (vinfo))
4201 break;
4203 if (gatherscatter != SG_NONE || simd_lane_access)
4204 free_data_ref (dr);
4205 return false;
4208 /* Update DR field in stmt_vec_info struct. */
4210 /* If the dataref is in an inner-loop of the loop that is considered for
4211 vectorization, we also want to analyze the access relative to
4212 the outer-loop (DR contains information only relative to the
4213 inner-most enclosing loop). We do that by building a reference to the
4214 first location accessed by the inner-loop, and analyze it relative to
4215 the outer-loop. */
4216 if (loop && nested_in_vect_loop_p (loop, stmt))
4218 /* Build a reference to the first location accessed by the
4219 inner loop: *(BASE + INIT + OFFSET). By construction,
4220 this address must be invariant in the inner loop, so we
4221 can consider it as being used in the outer loop. */
4222 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4223 init, offset);
4224 tree init_addr = fold_build_pointer_plus (base, init_offset);
4225 tree init_ref = build_fold_indirect_ref (init_addr);
4227 if (dump_enabled_p ())
4229 dump_printf_loc (MSG_NOTE, vect_location,
4230 "analyze in outer loop: ");
4231 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4232 dump_printf (MSG_NOTE, "\n");
4235 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4236 init_ref, loop))
4237 /* dr_analyze_innermost already explained the failure. */
4238 return false;
4240 if (dump_enabled_p ())
4242 dump_printf_loc (MSG_NOTE, vect_location,
4243 "\touter base_address: ");
4244 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4245 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4246 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4247 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4248 STMT_VINFO_DR_OFFSET (stmt_info));
4249 dump_printf (MSG_NOTE,
4250 "\n\touter constant offset from base address: ");
4251 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4252 STMT_VINFO_DR_INIT (stmt_info));
4253 dump_printf (MSG_NOTE, "\n\touter step: ");
4254 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4255 STMT_VINFO_DR_STEP (stmt_info));
4256 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4257 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4258 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4259 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4260 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4261 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4262 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4263 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4267 if (STMT_VINFO_DATA_REF (stmt_info))
4269 if (dump_enabled_p ())
4271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4272 "not vectorized: more than one data ref "
4273 "in stmt: ");
4274 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4277 if (is_a <bb_vec_info> (vinfo))
4278 break;
4280 if (gatherscatter != SG_NONE || simd_lane_access)
4281 free_data_ref (dr);
4282 return false;
4285 STMT_VINFO_DATA_REF (stmt_info) = dr;
4286 if (simd_lane_access)
4288 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4289 free_data_ref (datarefs[i]);
4290 datarefs[i] = dr;
4293 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4294 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4295 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4297 if (dump_enabled_p ())
4299 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4300 "not vectorized: base object not addressable "
4301 "for stmt: ");
4302 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4304 if (is_a <bb_vec_info> (vinfo))
4306 /* In BB vectorization the ref can still participate
4307 in dependence analysis, we just can't vectorize it. */
4308 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4309 continue;
4311 return false;
4314 /* Set vectype for STMT. */
4315 scalar_type = TREE_TYPE (DR_REF (dr));
4316 STMT_VINFO_VECTYPE (stmt_info)
4317 = get_vectype_for_scalar_type (scalar_type);
4318 if (!STMT_VINFO_VECTYPE (stmt_info))
4320 if (dump_enabled_p ())
4322 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4323 "not vectorized: no vectype for stmt: ");
4324 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4325 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4326 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4327 scalar_type);
4328 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4331 if (is_a <bb_vec_info> (vinfo))
4333 /* No vector type is fine, the ref can still participate
4334 in dependence analysis, we just can't vectorize it. */
4335 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4336 continue;
4339 if (gatherscatter != SG_NONE || simd_lane_access)
4341 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4342 if (gatherscatter != SG_NONE)
4343 free_data_ref (dr);
4345 return false;
4347 else
4349 if (dump_enabled_p ())
4351 dump_printf_loc (MSG_NOTE, vect_location,
4352 "got vectype for stmt: ");
4353 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4354 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4355 STMT_VINFO_VECTYPE (stmt_info));
4356 dump_printf (MSG_NOTE, "\n");
4360 /* Adjust the minimal vectorization factor according to the
4361 vector type. */
4362 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4363 *min_vf = upper_bound (*min_vf, vf);
4365 if (gatherscatter != SG_NONE)
4367 gather_scatter_info gs_info;
4368 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4369 &gs_info)
4370 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4372 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4373 free_data_ref (dr);
4374 if (dump_enabled_p ())
4376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4377 (gatherscatter == GATHER) ?
4378 "not vectorized: not suitable for gather "
4379 "load " :
4380 "not vectorized: not suitable for scatter "
4381 "store ");
4382 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4384 return false;
4387 free_data_ref (datarefs[i]);
4388 datarefs[i] = dr;
4389 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4392 else if (is_a <loop_vec_info> (vinfo)
4393 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4395 if (nested_in_vect_loop_p (loop, stmt))
4397 if (dump_enabled_p ())
4399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4400 "not vectorized: not suitable for strided "
4401 "load ");
4402 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4404 return false;
4406 STMT_VINFO_STRIDED_P (stmt_info) = true;
4410 /* If we stopped analysis at the first dataref that we could not analyze
4411 when trying to vectorize a basic block, mark the rest of the datarefs
4412 as not vectorizable and truncate the vector of datarefs. That
4413 avoids spending useless time in analyzing their dependence. */
4414 if (i != datarefs.length ())
4416 gcc_assert (is_a <bb_vec_info> (vinfo));
4417 for (unsigned j = i; j < datarefs.length (); ++j)
4419 data_reference_p dr = datarefs[j];
4420 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4421 free_data_ref (dr);
4423 datarefs.truncate (i);
4426 return true;
4430 /* Function vect_get_new_vect_var.
4432 Returns a name for a new variable. The current naming scheme prepends a
4433 prefix chosen according to VAR_KIND ("vect", "vectp", "stmp" or "mask")
4434 to NAME when NAME is provided (e.g. vect_pointer_var with NAME "foo"
4435 yields "vectp_foo"), and otherwise uses the prefix alone. */
4437 tree
4438 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4440 const char *prefix;
4441 tree new_vect_var;
4443 switch (var_kind)
4445 case vect_simple_var:
4446 prefix = "vect";
4447 break;
4448 case vect_scalar_var:
4449 prefix = "stmp";
4450 break;
4451 case vect_mask_var:
4452 prefix = "mask";
4453 break;
4454 case vect_pointer_var:
4455 prefix = "vectp";
4456 break;
4457 default:
4458 gcc_unreachable ();
4461 if (name)
4463 char* tmp = concat (prefix, "_", name, NULL);
4464 new_vect_var = create_tmp_reg (type, tmp);
4465 free (tmp);
4467 else
4468 new_vect_var = create_tmp_reg (type, prefix);
4470 return new_vect_var;
4473 /* Like vect_get_new_vect_var but return an SSA name. */
4475 tree
4476 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4478 const char *prefix;
4479 tree new_vect_var;
4481 switch (var_kind)
4483 case vect_simple_var:
4484 prefix = "vect";
4485 break;
4486 case vect_scalar_var:
4487 prefix = "stmp";
4488 break;
4489 case vect_pointer_var:
4490 prefix = "vectp";
4491 break;
4492 default:
4493 gcc_unreachable ();
4496 if (name)
4498 char* tmp = concat (prefix, "_", name, NULL);
4499 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4500 free (tmp);
4502 else
4503 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4505 return new_vect_var;
4508 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4510 static void
4511 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4513 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4514 int misalign = DR_MISALIGNMENT (dr);
4515 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4516 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4517 else
4518 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4519 DR_TARGET_ALIGNMENT (dr), misalign);
4522 /* Function vect_create_addr_base_for_vector_ref.
4524 Create an expression that computes the address of the first memory location
4525 that will be accessed for a data reference.
4527 Input:
4528 STMT: The statement containing the data reference.
4529 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4530 OFFSET: Optional. If supplied, it is added to the initial address.
4531 LOOP: Specify relative to which loop-nest should the address be computed.
4532 For example, when the dataref is in an inner-loop nested in an
4533 outer-loop that is now being vectorized, LOOP can be either the
4534 outer-loop, or the inner-loop. The first memory location accessed
4535 by the following dataref ('in' points to short):
4537 for (i=0; i<N; i++)
4538 for (j=0; j<M; j++)
4539 s += in[i+j]
4541 is as follows:
4542 if LOOP=i_loop: &in (relative to i_loop)
4543 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4544 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4545 initial address. Unlike OFFSET, which is number of elements to
4546 be added, BYTE_OFFSET is measured in bytes.
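              For illustration: if the dataref accesses 'short' elements,
              OFFSET 4 advances the initial address by 4 * sizeof (short)
              == 8 bytes, whereas BYTE_OFFSET 4 advances it by 4 bytes.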
4548 Output:
4549 1. Return an SSA_NAME whose value is the address of the memory location of
4550 the first vector of the data reference.
4551 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4552 these statement(s) which define the returned SSA_NAME.
4554 FORNOW: We are only handling array accesses with step 1. */
4556 tree
4557 vect_create_addr_base_for_vector_ref (gimple *stmt,
4558 gimple_seq *new_stmt_list,
4559 tree offset,
4560 tree byte_offset)
4562 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4563 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4564 const char *base_name;
4565 tree addr_base;
4566 tree dest;
4567 gimple_seq seq = NULL;
4568 tree vect_ptr_type;
4569 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4570 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4571 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4573 tree data_ref_base = unshare_expr (drb->base_address);
4574 tree base_offset = unshare_expr (drb->offset);
4575 tree init = unshare_expr (drb->init);
4577 if (loop_vinfo)
4578 base_name = get_name (data_ref_base);
4579 else
4581 base_offset = ssize_int (0);
4582 init = ssize_int (0);
4583 base_name = get_name (DR_REF (dr));
4586 /* Create base_offset */
4587 base_offset = size_binop (PLUS_EXPR,
4588 fold_convert (sizetype, base_offset),
4589 fold_convert (sizetype, init));
4591 if (offset)
4593 offset = fold_build2 (MULT_EXPR, sizetype,
4594 fold_convert (sizetype, offset), step);
4595 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4596 base_offset, offset);
4598 if (byte_offset)
4600 byte_offset = fold_convert (sizetype, byte_offset);
4601 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4602 base_offset, byte_offset);
4605 /* base + base_offset */
4606 if (loop_vinfo)
4607 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4608 else
4610 addr_base = build1 (ADDR_EXPR,
4611 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4612 unshare_expr (DR_REF (dr)));
4615 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4616 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4617 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4618 gimple_seq_add_seq (new_stmt_list, seq);
4620 if (DR_PTR_INFO (dr)
4621 && TREE_CODE (addr_base) == SSA_NAME
4622 && !SSA_NAME_PTR_INFO (addr_base))
4624 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4625 if (offset || byte_offset)
4626 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4629 if (dump_enabled_p ())
4631 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4632 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4633 dump_printf (MSG_NOTE, "\n");
4636 return addr_base;
4640 /* Function vect_create_data_ref_ptr.
4642 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4643 location accessed in the loop by STMT, along with the def-use update
4644 chain to appropriately advance the pointer through the loop iterations.
4645 Also set aliasing information for the pointer. This pointer is used by
4646 the callers to this function to create a memory reference expression for
4647 vector load/store access.
4649 Input:
4650 1. STMT: a stmt that references memory. Expected to be of the form
4651 GIMPLE_ASSIGN <name, data-ref> or
4652 GIMPLE_ASSIGN <data-ref, name>.
4653 2. AGGR_TYPE: the type of the reference, which should be either a vector
4654 or an array.
4655 3. AT_LOOP: the loop where the vector memref is to be created.
4656 4. OFFSET (optional): an offset to be added to the initial address accessed
4657 by the data-ref in STMT.
4658 5. BSI: location where the new stmts are to be placed if there is no loop
4659 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4660 pointing to the initial address.
4661 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4662 to the initial address accessed by the data-ref in STMT. This is
4663 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4664 in bytes.
4665 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4666 to the IV during each iteration of the loop. NULL says to move
4667 by one copy of AGGR_TYPE up or down, depending on the step of the
4668 data reference.
4670 Output:
4671 1. Declare a new ptr to vector_type, and have it point to the base of the
4672 data reference (the initial address accessed by the data reference).
4673 For example, for vector of type V8HI, the following code is generated:
4675 v8hi *ap;
4676 ap = (v8hi *)initial_address;
4678 if OFFSET is not supplied:
4679 initial_address = &a[init];
4680 if OFFSET is supplied:
4681 initial_address = &a[init + OFFSET];
4682 if BYTE_OFFSET is supplied:
4683 initial_address = &a[init] + BYTE_OFFSET;
4685 Return the initial_address in INITIAL_ADDRESS.
4687 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4688 update the pointer in each iteration of the loop.
4690 Return the increment stmt that updates the pointer in PTR_INCR.
4692 3. Set INV_P to true if the access pattern of the data reference in the
4693 vectorized loop is invariant. Set it to false otherwise.
4695 4. Return the pointer. */
4697 tree
4698 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4699 tree offset, tree *initial_address,
4700 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4701 bool only_init, bool *inv_p, tree byte_offset,
4702 tree iv_step)
4704 const char *base_name;
4705 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4706 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4707 struct loop *loop = NULL;
4708 bool nested_in_vect_loop = false;
4709 struct loop *containing_loop = NULL;
4710 tree aggr_ptr_type;
4711 tree aggr_ptr;
4712 tree new_temp;
4713 gimple_seq new_stmt_list = NULL;
4714 edge pe = NULL;
4715 basic_block new_bb;
4716 tree aggr_ptr_init;
4717 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4718 tree aptr;
4719 gimple_stmt_iterator incr_gsi;
4720 bool insert_after;
4721 tree indx_before_incr, indx_after_incr;
4722 gimple *incr;
4723 tree step;
4724 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4726 gcc_assert (iv_step != NULL_TREE
4727 || TREE_CODE (aggr_type) == ARRAY_TYPE
4728 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4730 if (loop_vinfo)
4732 loop = LOOP_VINFO_LOOP (loop_vinfo);
4733 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4734 containing_loop = (gimple_bb (stmt))->loop_father;
4735 pe = loop_preheader_edge (loop);
4737 else
4739 gcc_assert (bb_vinfo);
4740 only_init = true;
4741 *ptr_incr = NULL;
4744 /* Check the step (evolution) of the load in LOOP, and record
4745 whether it's invariant. */
4746 step = vect_dr_behavior (dr)->step;
4747 if (integer_zerop (step))
4748 *inv_p = true;
4749 else
4750 *inv_p = false;
4752 /* Create an expression for the first address accessed by this load
4753 in LOOP. */
4754 base_name = get_name (DR_BASE_ADDRESS (dr));
4756 if (dump_enabled_p ())
4758 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4759 dump_printf_loc (MSG_NOTE, vect_location,
4760 "create %s-pointer variable to type: ",
4761 get_tree_code_name (TREE_CODE (aggr_type)));
4762 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4763 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4764 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4765 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4766 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4767 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4768 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4769 else
4770 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4771 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4772 dump_printf (MSG_NOTE, "\n");
4775 /* (1) Create the new aggregate-pointer variable.
4776 Vector and array types inherit the alias set of their component
4777 type by default so we need to use a ref-all pointer if the data
4778 reference does not conflict with the created aggregated data
4779 reference because it is not addressable. */
4780 bool need_ref_all = false;
4781 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4782 get_alias_set (DR_REF (dr))))
4783 need_ref_all = true;
4784 /* Likewise for any of the data references in the stmt group. */
4785 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4787 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4790 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4791 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4792 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4793 get_alias_set (DR_REF (sdr))))
4795 need_ref_all = true;
4796 break;
4798 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4800 while (orig_stmt);
4802 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4803 need_ref_all);
4804 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4807 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4808 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4809 def-use update cycles for the pointer: one relative to the outer-loop
4810 (LOOP), which is what steps (3) and (4) below do. The other is relative
4811 to the inner-loop (which is the inner-most loop containing the dataref),
4812 and this is done by step (5) below.
4814 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4815 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4816 redundant. Steps (3),(4) create the following:
4818 vp0 = &base_addr;
4819 LOOP: vp1 = phi(vp0,vp2)
4822 vp2 = vp1 + step
4823 goto LOOP
4825 If there is an inner-loop nested in loop, then step (5) will also be
4826 applied, and an additional update in the inner-loop will be created:
4828 vp0 = &base_addr;
4829 LOOP: vp1 = phi(vp0,vp2)
4831 inner: vp3 = phi(vp1,vp4)
4832 vp4 = vp3 + inner_step
4833 if () goto inner
4835 vp2 = vp1 + step
4836 if () goto LOOP */
4838 /* (2) Calculate the initial address of the aggregate-pointer, and set
4839 the aggregate-pointer to point to it before the loop. */
4841 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4843 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4844 offset, byte_offset);
4845 if (new_stmt_list)
4847 if (pe)
4849 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4850 gcc_assert (!new_bb);
4852 else
4853 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4856 *initial_address = new_temp;
4857 aggr_ptr_init = new_temp;
4859 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4860 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4861 inner-loop nested in LOOP (during outer-loop vectorization). */
4863 /* No update in loop is required. */
4864 if (only_init && (!loop_vinfo || at_loop == loop))
4865 aptr = aggr_ptr_init;
4866 else
4868 if (iv_step == NULL_TREE)
4870 /* The step of the aggregate pointer is the type size. */
4871 iv_step = TYPE_SIZE_UNIT (aggr_type);
4872 /* One exception to the above is when the scalar step of the load in
4873 LOOP is zero. In this case the step here is also zero. */
4874 if (*inv_p)
4875 iv_step = size_zero_node;
4876 else if (tree_int_cst_sgn (step) == -1)
4877 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4880 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4882 create_iv (aggr_ptr_init,
4883 fold_convert (aggr_ptr_type, iv_step),
4884 aggr_ptr, loop, &incr_gsi, insert_after,
4885 &indx_before_incr, &indx_after_incr);
4886 incr = gsi_stmt (incr_gsi);
4887 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4889 /* Copy the points-to information if it exists. */
4890 if (DR_PTR_INFO (dr))
4892 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4893 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4895 if (ptr_incr)
4896 *ptr_incr = incr;
4898 aptr = indx_before_incr;
4901 if (!nested_in_vect_loop || only_init)
4902 return aptr;
4905 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4906 nested in LOOP, if it exists. */
4908 gcc_assert (nested_in_vect_loop);
4909 if (!only_init)
4911 standard_iv_increment_position (containing_loop, &incr_gsi,
4912 &insert_after);
4913 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4914 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4915 &indx_after_incr);
4916 incr = gsi_stmt (incr_gsi);
4917 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4919 /* Copy the points-to information if it exists. */
4920 if (DR_PTR_INFO (dr))
4922 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4923 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4925 if (ptr_incr)
4926 *ptr_incr = incr;
4928 return indx_before_incr;
4930 else
4931 gcc_unreachable ();
4935 /* Function bump_vector_ptr
4937 Increment a pointer (to a vector type) by vector-size. If requested,
4938 i.e. if PTR-INCR is given, then also connect the new increment stmt
4939 to the existing def-use update-chain of the pointer, by modifying
4940 the PTR_INCR as illustrated below:
4942 The pointer def-use update-chain before this function:
4943 DATAREF_PTR = phi (p_0, p_2)
4944 ....
4945 PTR_INCR: p_2 = DATAREF_PTR + step
4947 The pointer def-use update-chain after this function:
4948 DATAREF_PTR = phi (p_0, p_2)
4949 ....
4950 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4951 ....
4952 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4954 Input:
4955 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4956 in the loop.
4957 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4958 the loop. The increment amount across iterations is expected
4959 to be vector_size.
4960 BSI - location where the new update stmt is to be placed.
4961 STMT - the original scalar memory-access stmt that is being vectorized.
4962 BUMP - optional. The offset by which to bump the pointer. If not given,
4963 the offset is assumed to be vector_size.
4965 Output: Return NEW_DATAREF_PTR as illustrated above.
4969 tree
4970 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4971 gimple *stmt, tree bump)
4973 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4974 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4975 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4976 tree update = TYPE_SIZE_UNIT (vectype);
4977 gassign *incr_stmt;
4978 ssa_op_iter iter;
4979 use_operand_p use_p;
4980 tree new_dataref_ptr;
4982 if (bump)
4983 update = bump;
4985 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4986 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4987 else
4988 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4989 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4990 dataref_ptr, update);
4991 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4993 /* Copy the points-to information if it exists. */
4994 if (DR_PTR_INFO (dr))
4996 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4997 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5000 if (!ptr_incr)
5001 return new_dataref_ptr;
5003 /* Update the vector-pointer's cross-iteration increment. */
5004 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5006 tree use = USE_FROM_PTR (use_p);
5008 if (use == dataref_ptr)
5009 SET_USE (use_p, new_dataref_ptr);
5010 else
5011 gcc_assert (operand_equal_p (use, update, 0));
5014 return new_dataref_ptr;
5018 /* Copy memory reference info such as base/clique from the SRC reference
5019 to the DEST MEM_REF. */
5021 void
5022 vect_copy_ref_info (tree dest, tree src)
5024 if (TREE_CODE (dest) != MEM_REF)
5025 return;
5027 tree src_base = src;
5028 while (handled_component_p (src_base))
5029 src_base = TREE_OPERAND (src_base, 0);
5030 if (TREE_CODE (src_base) != MEM_REF
5031 && TREE_CODE (src_base) != TARGET_MEM_REF)
5032 return;
5034 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5035 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5039 /* Function vect_create_destination_var.
5041 Create a new temporary of type VECTYPE. */
5043 tree
5044 vect_create_destination_var (tree scalar_dest, tree vectype)
5046 tree vec_dest;
5047 const char *name;
5048 char *new_name;
5049 tree type;
5050 enum vect_var_kind kind;
5052 kind = vectype
5053 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5054 ? vect_mask_var
5055 : vect_simple_var
5056 : vect_scalar_var;
5057 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5059 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5061 name = get_name (scalar_dest);
5062 if (name)
5063 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5064 else
5065 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5066 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5067 free (new_name);
5069 return vec_dest;
5072 /* Function vect_grouped_store_supported.
5074 Returns TRUE if interleave high and interleave low permutations
5075 are supported, and FALSE otherwise. */
5077 bool
5078 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5080 machine_mode mode = TYPE_MODE (vectype);
5082 /* vect_permute_store_chain requires the group size to be equal to 3 or
5083 be a power of two. */
5084 if (count != 3 && exact_log2 (count) == -1)
5086 if (dump_enabled_p ())
5087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5088 "the size of the group of accesses"
5089 " is not a power of 2 or not equal to 3\n");
5090 return false;
5093 /* Check that the permutation is supported. */
5094 if (VECTOR_MODE_P (mode))
5096 unsigned int i;
5097 if (count == 3)
5099 unsigned int j0 = 0, j1 = 0, j2 = 0;
5100 unsigned int i, j;
5102 unsigned int nelt;
5103 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5105 if (dump_enabled_p ())
5106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5107 "cannot handle groups of 3 stores for"
5108 " variable-length vectors\n");
5109 return false;
5112 vec_perm_builder sel (nelt, nelt, 1);
5113 sel.quick_grow (nelt);
5114 vec_perm_indices indices;
5115 for (j = 0; j < 3; j++)
5117 int nelt0 = ((3 - j) * nelt) % 3;
5118 int nelt1 = ((3 - j) * nelt + 1) % 3;
5119 int nelt2 = ((3 - j) * nelt + 2) % 3;
5120 for (i = 0; i < nelt; i++)
5122 if (3 * i + nelt0 < nelt)
5123 sel[3 * i + nelt0] = j0++;
5124 if (3 * i + nelt1 < nelt)
5125 sel[3 * i + nelt1] = nelt + j1++;
5126 if (3 * i + nelt2 < nelt)
5127 sel[3 * i + nelt2] = 0;
5129 indices.new_vector (sel, 2, nelt);
5130 if (!can_vec_perm_const_p (mode, indices))
5132 if (dump_enabled_p ())
5133 dump_printf (MSG_MISSED_OPTIMIZATION,
5134 "permutation op not supported by target.\n");
5135 return false;
5138 for (i = 0; i < nelt; i++)
5140 if (3 * i + nelt0 < nelt)
5141 sel[3 * i + nelt0] = 3 * i + nelt0;
5142 if (3 * i + nelt1 < nelt)
5143 sel[3 * i + nelt1] = 3 * i + nelt1;
5144 if (3 * i + nelt2 < nelt)
5145 sel[3 * i + nelt2] = nelt + j2++;
5147 indices.new_vector (sel, 2, nelt);
5148 if (!can_vec_perm_const_p (mode, indices))
5150 if (dump_enabled_p ())
5151 dump_printf (MSG_MISSED_OPTIMIZATION,
5152 "permutation op not supported by target.\n");
5153 return false;
5156 return true;
5158 else
5160 /* If length is not equal to 3 then only power of 2 is supported. */
5161 gcc_assert (pow2p_hwi (count));
5162 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5164 /* The encoding has 2 interleaved stepped patterns. */
5165 vec_perm_builder sel (nelt, 2, 3);
5166 sel.quick_grow (6);
5167 for (i = 0; i < 3; i++)
5169 sel[i * 2] = i;
5170 sel[i * 2 + 1] = i + nelt;
5172 vec_perm_indices indices (sel, 2, nelt);
5173 if (can_vec_perm_const_p (mode, indices))
5175 for (i = 0; i < 6; i++)
5176 sel[i] += exact_div (nelt, 2);
5177 indices.new_vector (sel, 2, nelt);
5178 if (can_vec_perm_const_p (mode, indices))
5179 return true;
5184 if (dump_enabled_p ())
5185 dump_printf (MSG_MISSED_OPTIMIZATION,
5186 "permutation op not supported by target.\n");
5187 return false;
5191 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5192 type VECTYPE. MASKED_P says whether the masked form is needed. */
5194 bool
5195 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5196 bool masked_p)
5198 if (masked_p)
5199 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5200 vec_mask_store_lanes_optab,
5201 vectype, count);
5202 else
5203 return vect_lanes_optab_supported_p ("vec_store_lanes",
5204 vec_store_lanes_optab,
5205 vectype, count);
5209 /* Function vect_permute_store_chain.
5211 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5212 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5213 the data correctly for the stores. Return the final references for stores
5214 in RESULT_CHAIN.
5216 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5217 The input is 4 vectors each containing 8 elements. We assign a number to
5218 each element, the input sequence is:
5220 1st vec: 0 1 2 3 4 5 6 7
5221 2nd vec: 8 9 10 11 12 13 14 15
5222 3rd vec: 16 17 18 19 20 21 22 23
5223 4th vec: 24 25 26 27 28 29 30 31
5225 The output sequence should be:
5227 1st vec: 0 8 16 24 1 9 17 25
5228 2nd vec: 2 10 18 26 3 11 19 27
5229 3rd vec: 4 12 20 28 5 13 21 29
5230 4th vec: 6 14 22 30 7 15 23 31
5232 i.e., we interleave the contents of the four vectors in their order.
5234 We use interleave_high/low instructions to create such output. The input of
5235 each interleave_high/low operation is two vectors:
5236 1st vec 2nd vec
5237 0 1 2 3 4 5 6 7
5238 the even elements of the result vector are obtained left-to-right from the
5239 high/low elements of the first vector. The odd elements of the result are
5240 obtained left-to-right from the high/low elements of the second vector.
5241 The output of interleave_high will be: 0 4 1 5
5242 and of interleave_low: 2 6 3 7
5245 The permutation is done in log LENGTH stages. In each stage interleave_high
5246 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5247 where the first argument is taken from the first half of DR_CHAIN and the
5248 second argument from its second half.
5249 In our example,
5251 I1: interleave_high (1st vec, 3rd vec)
5252 I2: interleave_low (1st vec, 3rd vec)
5253 I3: interleave_high (2nd vec, 4th vec)
5254 I4: interleave_low (2nd vec, 4th vec)
5256 The output for the first stage is:
5258 I1: 0 16 1 17 2 18 3 19
5259 I2: 4 20 5 21 6 22 7 23
5260 I3: 8 24 9 25 10 26 11 27
5261 I4: 12 28 13 29 14 30 15 31
5263 The output of the second stage, i.e. the final result is:
5265 I1: 0 8 16 24 1 9 17 25
5266 I2: 2 10 18 26 3 11 19 27
5267 I3: 4 12 20 28 5 13 21 29
5268 I4: 6 14 22 30 7 15 23 31. */
5270 void
5271 vect_permute_store_chain (vec<tree> dr_chain,
5272 unsigned int length,
5273 gimple *stmt,
5274 gimple_stmt_iterator *gsi,
5275 vec<tree> *result_chain)
5277 tree vect1, vect2, high, low;
5278 gimple *perm_stmt;
5279 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5280 tree perm_mask_low, perm_mask_high;
5281 tree data_ref;
5282 tree perm3_mask_low, perm3_mask_high;
5283 unsigned int i, j, n, log_length = exact_log2 (length);
5285 result_chain->quick_grow (length);
5286 memcpy (result_chain->address (), dr_chain.address (),
5287 length * sizeof (tree));
5289 if (length == 3)
5291 /* vect_grouped_store_supported ensures that this is constant. */
5292 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5293 unsigned int j0 = 0, j1 = 0, j2 = 0;
5295 vec_perm_builder sel (nelt, nelt, 1);
5296 sel.quick_grow (nelt);
5297 vec_perm_indices indices;
5298 for (j = 0; j < 3; j++)
5300 int nelt0 = ((3 - j) * nelt) % 3;
5301 int nelt1 = ((3 - j) * nelt + 1) % 3;
5302 int nelt2 = ((3 - j) * nelt + 2) % 3;
5304 for (i = 0; i < nelt; i++)
5306 if (3 * i + nelt0 < nelt)
5307 sel[3 * i + nelt0] = j0++;
5308 if (3 * i + nelt1 < nelt)
5309 sel[3 * i + nelt1] = nelt + j1++;
5310 if (3 * i + nelt2 < nelt)
5311 sel[3 * i + nelt2] = 0;
5313 indices.new_vector (sel, 2, nelt);
5314 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5316 for (i = 0; i < nelt; i++)
5318 if (3 * i + nelt0 < nelt)
5319 sel[3 * i + nelt0] = 3 * i + nelt0;
5320 if (3 * i + nelt1 < nelt)
5321 sel[3 * i + nelt1] = 3 * i + nelt1;
5322 if (3 * i + nelt2 < nelt)
5323 sel[3 * i + nelt2] = nelt + j2++;
5325 indices.new_vector (sel, 2, nelt);
5326 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5328 vect1 = dr_chain[0];
5329 vect2 = dr_chain[1];
5331 /* Create interleaving stmt:
5332 low = VEC_PERM_EXPR <vect1, vect2,
5333 {j, nelt, *, j + 1, nelt + j + 1, *,
5334 j + 2, nelt + j + 2, *, ...}> */
5335 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5336 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5337 vect2, perm3_mask_low);
5338 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5340 vect1 = data_ref;
5341 vect2 = dr_chain[2];
5342 /* Create interleaving stmt:
5343 low = VEC_PERM_EXPR <vect1, vect2,
5344 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5345 6, 7, nelt + j + 2, ...}> */
5346 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5347 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5348 vect2, perm3_mask_high);
5349 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5350 (*result_chain)[j] = data_ref;
5353 else
5355 /* If length is not equal to 3 then only power of 2 is supported. */
5356 gcc_assert (pow2p_hwi (length));
5358 /* The encoding has 2 interleaved stepped patterns. */
5359 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5360 vec_perm_builder sel (nelt, 2, 3);
5361 sel.quick_grow (6);
5362 for (i = 0; i < 3; i++)
5364 sel[i * 2] = i;
5365 sel[i * 2 + 1] = i + nelt;
5367 vec_perm_indices indices (sel, 2, nelt);
5368 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5370 for (i = 0; i < 6; i++)
5371 sel[i] += exact_div (nelt, 2);
5372 indices.new_vector (sel, 2, nelt);
5373 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
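      /* Illustration (assuming an 8-element vector): the masks built above
         expand to
           perm_mask_high: {0, 8, 1, 9, 2, 10, 3, 11}
           perm_mask_low:  {4, 12, 5, 13, 6, 14, 7, 15}
         i.e. the first interleaves the low halves of the two input vectors
         and the second interleaves their high halves.  */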
5375 for (i = 0, n = log_length; i < n; i++)
5377 for (j = 0; j < length/2; j++)
5379 vect1 = dr_chain[j];
5380 vect2 = dr_chain[j+length/2];
5382 /* Create interleaving stmt:
5383 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5384 ...}> */
5385 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5386 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5387 vect2, perm_mask_high);
5388 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5389 (*result_chain)[2*j] = high;
5391 /* Create interleaving stmt:
5392 low = VEC_PERM_EXPR <vect1, vect2,
5393 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5394 ...}> */
5395 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5396 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5397 vect2, perm_mask_low);
5398 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5399 (*result_chain)[2*j+1] = low;
5401 memcpy (dr_chain.address (), result_chain->address (),
5402 length * sizeof (tree));
5407 /* Function vect_setup_realignment
5409 This function is called when vectorizing an unaligned load using
5410 the dr_explicit_realign[_optimized] scheme.
5411 This function generates the following code at the loop prolog:
5413 p = initial_addr;
5414 x msq_init = *(floor(p)); # prolog load
5415 realignment_token = call target_builtin;
5416 loop:
5417 x msq = phi (msq_init, ---)
5419 The stmts marked with x are generated only for the case of
5420 dr_explicit_realign_optimized.
5422 The code above sets up a new (vector) pointer, pointing to the first
5423 location accessed by STMT, and a "floor-aligned" load using that pointer.
5424 It also generates code to compute the "realignment-token" (if the relevant
5425 target hook was defined), and creates a phi-node at the loop-header bb
5426 whose arguments are the result of the prolog-load (created by this
5427 function) and the result of a load that takes place in the loop (to be
5428 created by the caller to this function).
5430 For the case of dr_explicit_realign_optimized:
5431 The caller to this function uses the phi-result (msq) to create the
5432 realignment code inside the loop, and sets up the missing phi argument,
5433 as follows:
5434 loop:
5435 msq = phi (msq_init, lsq)
5436 lsq = *(floor(p')); # load in loop
5437 result = realign_load (msq, lsq, realignment_token);
5439 For the case of dr_explicit_realign:
5440 loop:
5441 msq = *(floor(p)); # load in loop
5442 p' = p + (VS-1);
5443 lsq = *(floor(p')); # load in loop
5444 result = realign_load (msq, lsq, realignment_token);
5446 Input:
5447 STMT - (scalar) load stmt to be vectorized. This load accesses
5448 a memory location that may be unaligned.
5449 GSI - place where new code is to be inserted.
5450 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5451 is used.
5453 Output:
5454 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5455 target hook, if defined.
5456 Return value - the result of the loop-header phi node. */
5458 tree
5459 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5460 tree *realignment_token,
5461 enum dr_alignment_support alignment_support_scheme,
5462 tree init_addr,
5463 struct loop **at_loop)
5465 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5466 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5467 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5468 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5469 struct loop *loop = NULL;
5470 edge pe = NULL;
5471 tree scalar_dest = gimple_assign_lhs (stmt);
5472 tree vec_dest;
5473 gimple *inc;
5474 tree ptr;
5475 tree data_ref;
5476 basic_block new_bb;
5477 tree msq_init = NULL_TREE;
5478 tree new_temp;
5479 gphi *phi_stmt;
5480 tree msq = NULL_TREE;
5481 gimple_seq stmts = NULL;
5482 bool inv_p;
5483 bool compute_in_loop = false;
5484 bool nested_in_vect_loop = false;
5485 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5486 struct loop *loop_for_initial_load = NULL;
5488 if (loop_vinfo)
5490 loop = LOOP_VINFO_LOOP (loop_vinfo);
5491 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5494 gcc_assert (alignment_support_scheme == dr_explicit_realign
5495 || alignment_support_scheme == dr_explicit_realign_optimized);
5497 /* We need to generate three things:
5498 1. the misalignment computation
5499 2. the extra vector load (for the optimized realignment scheme).
5500 3. the phi node for the two vectors from which the realignment is
5501 done (for the optimized realignment scheme). */
5503 /* 1. Determine where to generate the misalignment computation.
5505 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5506 calculation will be generated by this function, outside the loop (in the
5507 preheader). Otherwise, INIT_ADDR has already been computed for us by the
5508 caller, inside the loop.
5510 Background: If the misalignment remains fixed throughout the iterations of
5511 the loop, then both realignment schemes are applicable, and also the
5512 misalignment computation can be done outside LOOP. This is because we are
5513 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5514 are a multiple of VS (the Vector Size), and therefore the misalignment in
5515 different vectorized LOOP iterations is always the same.
5516 The problem arises only if the memory access is in an inner-loop nested
5517 inside LOOP, which is now being vectorized using outer-loop vectorization.
5518 This is the only case when the misalignment of the memory access may not
5519 remain fixed throughout the iterations of the inner-loop (as explained in
5520 detail in vect_supportable_dr_alignment). In this case, not only is the
5521 optimized realignment scheme not applicable, but also the misalignment
5522 computation (and generation of the realignment token that is passed to
5523 REALIGN_LOAD) have to be done inside the loop.
5525 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5526 or not, which in turn determines if the misalignment is computed inside
5527 the inner-loop, or outside LOOP. */
5529 if (init_addr != NULL_TREE || !loop_vinfo)
5531 compute_in_loop = true;
5532 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5536 /* 2. Determine where to generate the extra vector load.
5538 For the optimized realignment scheme, instead of generating two vector
5539 loads in each iteration, we generate a single extra vector load in the
5540 preheader of the loop, and in each iteration reuse the result of the
5541 vector load from the previous iteration. In case the memory access is in
5542 an inner-loop nested inside LOOP, which is now being vectorized using
5543 outer-loop vectorization, we need to determine whether this initial vector
5544 load should be generated at the preheader of the inner-loop, or can be
5545 generated at the preheader of LOOP. If the memory access has no evolution
5546 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5547 to be generated inside LOOP (in the preheader of the inner-loop). */
5549 if (nested_in_vect_loop)
5551 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5552 bool invariant_in_outerloop =
5553 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5554 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5556 else
5557 loop_for_initial_load = loop;
5558 if (at_loop)
5559 *at_loop = loop_for_initial_load;
5561 if (loop_for_initial_load)
5562 pe = loop_preheader_edge (loop_for_initial_load);
5564 /* 3. For the case of the optimized realignment, create the first vector
5565 load at the loop preheader. */
5567 if (alignment_support_scheme == dr_explicit_realign_optimized)
5569 /* Create msq_init = *(floor(p1)) in the loop preheader */
5570 gassign *new_stmt;
5572 gcc_assert (!compute_in_loop);
5573 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5574 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5575 NULL_TREE, &init_addr, NULL, &inc,
5576 true, &inv_p);
5577 if (TREE_CODE (ptr) == SSA_NAME)
5578 new_temp = copy_ssa_name (ptr);
5579 else
5580 new_temp = make_ssa_name (TREE_TYPE (ptr));
5581 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5582 new_stmt = gimple_build_assign
5583 (new_temp, BIT_AND_EXPR, ptr,
5584 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
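      /* The BIT_AND_EXPR built above implements the floor(p) of the function
         comment: e.g. (illustrative) with a 16-byte DR_TARGET_ALIGNMENT the
         pointer is masked with -16, clearing the four low address bits.  */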
5585 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5586 gcc_assert (!new_bb);
5587 data_ref
5588 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5589 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5590 vect_copy_ref_info (data_ref, DR_REF (dr));
5591 new_stmt = gimple_build_assign (vec_dest, data_ref);
5592 new_temp = make_ssa_name (vec_dest, new_stmt);
5593 gimple_assign_set_lhs (new_stmt, new_temp);
5594 if (pe)
5596 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5597 gcc_assert (!new_bb);
5599 else
5600 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5602 msq_init = gimple_assign_lhs (new_stmt);
5605 /* 4. Create realignment token using a target builtin, if available.
5606 It is done either inside the containing loop, or before LOOP (as
5607 determined above). */
5609 if (targetm.vectorize.builtin_mask_for_load)
5611 gcall *new_stmt;
5612 tree builtin_decl;
5614 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5615 if (!init_addr)
5617 /* Generate the INIT_ADDR computation outside LOOP. */
5618 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5619 NULL_TREE);
5620 if (loop)
5622 pe = loop_preheader_edge (loop);
5623 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5624 gcc_assert (!new_bb);
5626 else
5627 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5630 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5631 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5632 vec_dest =
5633 vect_create_destination_var (scalar_dest,
5634 gimple_call_return_type (new_stmt));
5635 new_temp = make_ssa_name (vec_dest, new_stmt);
5636 gimple_call_set_lhs (new_stmt, new_temp);
5638 if (compute_in_loop)
5639 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5640 else
5642 /* Generate the misalignment computation outside LOOP. */
5643 pe = loop_preheader_edge (loop);
5644 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5645 gcc_assert (!new_bb);
5648 *realignment_token = gimple_call_lhs (new_stmt);
5650 /* The result of the CALL_EXPR to this builtin is determined from
5651 the value of the parameter and no global variables are touched,
5652 which makes the builtin a "const" function. Requiring the
5653 builtin to have the "const" attribute makes it unnecessary
5654 to call mark_call_clobbered. */
5655 gcc_assert (TREE_READONLY (builtin_decl));
5658 if (alignment_support_scheme == dr_explicit_realign)
5659 return msq;
5661 gcc_assert (!compute_in_loop);
5662 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5665 /* 5. Create msq = phi <msq_init, lsq> in loop */
5667 pe = loop_preheader_edge (containing_loop);
5668 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5669 msq = make_ssa_name (vec_dest);
5670 phi_stmt = create_phi_node (msq, containing_loop->header);
5671 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5673 return msq;
5677 /* Function vect_grouped_load_supported.
5679 COUNT is the size of the load group (the number of statements plus the
5680 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5681 only one statement, with a gap of COUNT - 1.
5683 Returns true if a suitable permute exists. */
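/* An illustrative example (not from the original comment): a loop that reads
   only a[4 * i] out of each group of four elements has COUNT == 4 and
   SINGLE_ELEMENT_P true (one statement, gap of 3), while a loop reading both
   a[2 * i] and a[2 * i + 1] has COUNT == 2 and SINGLE_ELEMENT_P false.  */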
5685 bool
5686 vect_grouped_load_supported (tree vectype, bool single_element_p,
5687 unsigned HOST_WIDE_INT count)
5689 machine_mode mode = TYPE_MODE (vectype);
5691 /* If this is single-element interleaving with an element distance
5692 that leaves unused vector loads around, punt - at best we create
5693 very sub-optimal code in that case (and blow up memory,
5694 see PR65518). */
5695 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5697 if (dump_enabled_p ())
5698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5699 "single-element interleaving not supported "
5700 "for not adjacent vector loads\n");
5701 return false;
5704 /* vect_permute_load_chain requires the group size to be equal to 3 or
5705 be a power of two. */
5706 if (count != 3 && exact_log2 (count) == -1)
5708 if (dump_enabled_p ())
5709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5710 "the size of the group of accesses"
5711 " is not a power of 2 or not equal to 3\n");
5712 return false;
5715 /* Check that the permutation is supported. */
5716 if (VECTOR_MODE_P (mode))
5718 unsigned int i, j;
5719 if (count == 3)
5721 unsigned int nelt;
5722 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5724 if (dump_enabled_p ())
5725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5726 "cannot handle groups of 3 loads for"
5727 " variable-length vectors\n");
5728 return false;
5731 vec_perm_builder sel (nelt, nelt, 1);
5732 sel.quick_grow (nelt);
5733 vec_perm_indices indices;
5734 unsigned int k;
5735 for (k = 0; k < 3; k++)
5737 for (i = 0; i < nelt; i++)
5738 if (3 * i + k < 2 * nelt)
5739 sel[i] = 3 * i + k;
5740 else
5741 sel[i] = 0;
5742 indices.new_vector (sel, 2, nelt);
5743 if (!can_vec_perm_const_p (mode, indices))
5745 if (dump_enabled_p ())
5746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5747 "shuffle of 3 loads is not supported by"
5748 " target\n");
5749 return false;
5751 for (i = 0, j = 0; i < nelt; i++)
5752 if (3 * i + k < 2 * nelt)
5753 sel[i] = i;
5754 else
5755 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5756 indices.new_vector (sel, 2, nelt);
5757 if (!can_vec_perm_const_p (mode, indices))
5759 if (dump_enabled_p ())
5760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5761 "shuffle of 3 loads is not supported by"
5762 " target\n");
5763 return false;
5766 return true;
5768 else
5770 /* If length is not equal to 3 then only power of 2 is supported. */
5771 gcc_assert (pow2p_hwi (count));
5772 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5774 /* The encoding has a single stepped pattern. */
5775 vec_perm_builder sel (nelt, 1, 3);
5776 sel.quick_grow (3);
5777 for (i = 0; i < 3; i++)
5778 sel[i] = i * 2;
5779 vec_perm_indices indices (sel, 2, nelt);
5780 if (can_vec_perm_const_p (mode, indices))
5782 for (i = 0; i < 3; i++)
5783 sel[i] = i * 2 + 1;
5784 indices.new_vector (sel, 2, nelt);
5785 if (can_vec_perm_const_p (mode, indices))
5786 return true;
5791 if (dump_enabled_p ())
5792 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5793 "extract even/odd not supported by target\n");
5794 return false;
5797 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5798 type VECTYPE. MASKED_P says whether the masked form is needed. */
5800 bool
5801 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5802 bool masked_p)
5804 if (masked_p)
5805 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5806 vec_mask_load_lanes_optab,
5807 vectype, count);
5808 else
5809 return vect_lanes_optab_supported_p ("vec_load_lanes",
5810 vec_load_lanes_optab,
5811 vectype, count);
5814 /* Function vect_permute_load_chain.
5816 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5817 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5818 the input data correctly. Return the final references for loads in
5819 RESULT_CHAIN.
5821 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5822 The input is 4 vectors each containing 8 elements. We assign a number to each
5823 element, the input sequence is:
5825 1st vec: 0 1 2 3 4 5 6 7
5826 2nd vec: 8 9 10 11 12 13 14 15
5827 3rd vec: 16 17 18 19 20 21 22 23
5828 4th vec: 24 25 26 27 28 29 30 31
5830 The output sequence should be:
5832 1st vec: 0 4 8 12 16 20 24 28
5833 2nd vec: 1 5 9 13 17 21 25 29
5834 3rd vec: 2 6 10 14 18 22 26 30
5835 4th vec: 3 7 11 15 19 23 27 31
5837 i.e., the first output vector should contain the first elements of each
5838 interleaving group, etc.
5840 We use extract_even/odd instructions to create such output. The input of
5841 each extract_even/odd operation is two vectors
5842 1st vec 2nd vec
5843 0 1 2 3 4 5 6 7
5845 and the output is the vector of extracted even/odd elements. The output of
5846 extract_even will be: 0 2 4 6
5847 and of extract_odd: 1 3 5 7
5850 The permutation is done in log LENGTH stages. In each stage extract_even
5851 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5852 their order. In our example,
5854 E1: extract_even (1st vec, 2nd vec)
5855 E2: extract_odd (1st vec, 2nd vec)
5856 E3: extract_even (3rd vec, 4th vec)
5857 E4: extract_odd (3rd vec, 4th vec)
5859 The output for the first stage will be:
5861 E1: 0 2 4 6 8 10 12 14
5862 E2: 1 3 5 7 9 11 13 15
5863 E3: 16 18 20 22 24 26 28 30
5864 E4: 17 19 21 23 25 27 29 31
5866 In order to proceed and create the correct sequence for the next stage (or
5867 for the correct output, if the second stage is the last one, as in our
5868 example), we first put the output of the extract_even operation and then the
5869 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5870 The input for the second stage is:
5872 1st vec (E1): 0 2 4 6 8 10 12 14
5873 2nd vec (E3): 16 18 20 22 24 26 28 30
5874 3rd vec (E2): 1 3 5 7 9 11 13 15
5875 4th vec (E4): 17 19 21 23 25 27 29 31
5877 The output of the second stage:
5879 E1: 0 4 8 12 16 20 24 28
5880 E2: 2 6 10 14 18 22 26 30
5881 E3: 1 5 9 13 17 21 25 29
5882 E4: 3 7 11 15 19 23 27 31
5884 And RESULT_CHAIN after reordering:
5886 1st vec (E1): 0 4 8 12 16 20 24 28
5887 2nd vec (E3): 1 5 9 13 17 21 25 29
5888 3rd vec (E2): 2 6 10 14 18 22 26 30
5889 4th vec (E4): 3 7 11 15 19 23 27 31. */
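/* For illustration only: a scalar loop of roughly the following shape gives
   rise to a load group of LENGTH 4 that this routine has to de-interleave:

     for (i = 0; i < n; i++)
       sum += a[4 * i] + a[4 * i + 1] + a[4 * i + 2] + a[4 * i + 3];

   DR_CHAIN holds the LENGTH vectors loaded from consecutive memory and
   RESULT_CHAIN receives the de-interleaved vectors described above.  */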
5891 static void
5892 vect_permute_load_chain (vec<tree> dr_chain,
5893 unsigned int length,
5894 gimple *stmt,
5895 gimple_stmt_iterator *gsi,
5896 vec<tree> *result_chain)
5898 tree data_ref, first_vect, second_vect;
5899 tree perm_mask_even, perm_mask_odd;
5900 tree perm3_mask_low, perm3_mask_high;
5901 gimple *perm_stmt;
5902 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5903 unsigned int i, j, log_length = exact_log2 (length);
5905 result_chain->quick_grow (length);
5906 memcpy (result_chain->address (), dr_chain.address (),
5907 length * sizeof (tree));
5909 if (length == 3)
5911 /* vect_grouped_load_supported ensures that this is constant. */
5912 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5913 unsigned int k;
5915 vec_perm_builder sel (nelt, nelt, 1);
5916 sel.quick_grow (nelt);
5917 vec_perm_indices indices;
5918 for (k = 0; k < 3; k++)
5920 for (i = 0; i < nelt; i++)
5921 if (3 * i + k < 2 * nelt)
5922 sel[i] = 3 * i + k;
5923 else
5924 sel[i] = 0;
5925 indices.new_vector (sel, 2, nelt);
5926 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5928 for (i = 0, j = 0; i < nelt; i++)
5929 if (3 * i + k < 2 * nelt)
5930 sel[i] = i;
5931 else
5932 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5933 indices.new_vector (sel, 2, nelt);
5934 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
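      /* Illustration (assuming nelt == 8): for k == 0 the masks built above
         expand to
           perm3_mask_low:  {0, 3, 6, 9, 12, 15, 0, 0}
           perm3_mask_high: {0, 1, 2, 3, 4, 5, 10, 13}
         so the two VEC_PERM_EXPRs below produce {0, 3, 6, 9, 12, 15, 18, 21},
         numbering the elements of dr_chain[0..2] consecutively from 0.  */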
5936 first_vect = dr_chain[0];
5937 second_vect = dr_chain[1];
5939 /* Create interleaving stmt (low part of):
5940 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5941 ...}> */
5942 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5943 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5944 second_vect, perm3_mask_low);
5945 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5947 /* Create interleaving stmt (high part of):
5948 high = VEC_PERM_EXPR <first_vect, second_vect,
5949 {0, 1, ..., nelt + (nelt + k) % 3, nelt + (nelt + k) % 3 + 3, ...}> */
5950 first_vect = data_ref;
5951 second_vect = dr_chain[2];
5952 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5953 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5954 second_vect, perm3_mask_high);
5955 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5956 (*result_chain)[k] = data_ref;
5959 else
5961 /* If length is not equal to 3 then only power of 2 is supported. */
5962 gcc_assert (pow2p_hwi (length));
5964 /* The encoding has a single stepped pattern. */
5965 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5966 vec_perm_builder sel (nelt, 1, 3);
5967 sel.quick_grow (3);
5968 for (i = 0; i < 3; ++i)
5969 sel[i] = i * 2;
5970 vec_perm_indices indices (sel, 2, nelt);
5971 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5973 for (i = 0; i < 3; ++i)
5974 sel[i] = i * 2 + 1;
5975 indices.new_vector (sel, 2, nelt);
5976 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
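      /* Illustration (assuming an 8-element vector): the masks expand to
           perm_mask_even: {0, 2, 4, 6, 8, 10, 12, 14}
           perm_mask_odd:  {1, 3, 5, 7, 9, 11, 13, 15}
         i.e. the extract-even / extract-odd selectors used in the example
         in the function comment.  */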
5978 for (i = 0; i < log_length; i++)
5980 for (j = 0; j < length; j += 2)
5982 first_vect = dr_chain[j];
5983 second_vect = dr_chain[j+1];
5985 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5986 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5987 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5988 first_vect, second_vect,
5989 perm_mask_even);
5990 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5991 (*result_chain)[j/2] = data_ref;
5993 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5994 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5995 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5996 first_vect, second_vect,
5997 perm_mask_odd);
5998 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5999 (*result_chain)[j/2+length/2] = data_ref;
6001 memcpy (dr_chain.address (), result_chain->address (),
6002 length * sizeof (tree));
6007 /* Function vect_shift_permute_load_chain.
6009 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6010 a sequence of stmts to reorder the input data accordingly.
6011 Return the final references for loads in RESULT_CHAIN.
6012 Return true if successful, false otherwise.
6014 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6015 The input is 3 vectors each containing 8 elements. We assign a
6016 number to each element, the input sequence is:
6018 1st vec: 0 1 2 3 4 5 6 7
6019 2nd vec: 8 9 10 11 12 13 14 15
6020 3rd vec: 16 17 18 19 20 21 22 23
6022 The output sequence should be:
6024 1st vec: 0 3 6 9 12 15 18 21
6025 2nd vec: 1 4 7 10 13 16 19 22
6026 3rd vec: 2 5 8 11 14 17 20 23
6028 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6030 First we shuffle all 3 vectors to get the correct element order:
6032 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6033 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6034 3rd vec: (16 19 22) (17 20 23) (18 21)
6036 Next we unite and shift the vectors 3 times:
6038 1st step:
6039 shift right by 6 the concatenation of:
6040 "1st vec" and "2nd vec"
6041 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6042 "2nd vec" and "3rd vec"
6043 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6044 "3rd vec" and "1st vec"
6045 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6046 | New vectors |
6048 So that now new vectors are:
6050 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6051 2nd vec: (10 13) (16 19 22) (17 20 23)
6052 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6054 2nd step:
6055 shift right by 5 the concatenation of:
6056 "1st vec" and "3rd vec"
6057 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6058 "2nd vec" and "1st vec"
6059 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6060 "3rd vec" and "2nd vec"
6061 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6062 | New vectors |
6064 So that now new vectors are:
6066 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6067 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6068 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6070 3rd step:
6071 shift right by 5 the concatenation of:
6072 "1st vec" and "1st vec"
6073 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6074 shift right by 3 the concatenation of:
6075 "2nd vec" and "2nd vec"
6076 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6077 | New vectors |
6079 So that now all vectors are READY:
6080 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6081 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6082 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6084 This algorithm is faster than the one in vect_permute_load_chain if:
6085 1. "shift of a concatenation" is faster than general permutation.
6086 This is usually so.
6087 2. The TARGET machine can't execute vector instructions in parallel.
6088 This is because each step of the algorithm depends on the previous one.
6089 The algorithm in vect_permute_load_chain is much more parallel.
6091 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.  */
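/* A further illustration (LENGTH == 2 with 8-element vectors, a case not
   covered by the example above).  With inputs

     1st vec:  0  1  2  3  4  5  6  7
     2nd vec:  8  9 10 11 12 13 14 15

   the two per-vector shuffles below produce

     vect[0]:  0  2  4  6  1  3  5  7
     vect[1]:  9 11 13 15  8 10 12 14

   shifting their concatenation by nelt/2 yields all odd elements

     1  3  5  7  9 11 13 15

   and selecting the outer halves yields all even elements

     0  2  4  6  8 10 12 14

   which become the 2nd and 1st output vectors respectively.  */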
6094 static bool
6095 vect_shift_permute_load_chain (vec<tree> dr_chain,
6096 unsigned int length,
6097 gimple *stmt,
6098 gimple_stmt_iterator *gsi,
6099 vec<tree> *result_chain)
6101 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6102 tree perm2_mask1, perm2_mask2, perm3_mask;
6103 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6104 gimple *perm_stmt;
6106 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6107 unsigned int i;
6108 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6109 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6111 unsigned HOST_WIDE_INT nelt, vf;
6112 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6113 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6114 /* Not supported for variable-length vectors. */
6115 return false;
6117 vec_perm_builder sel (nelt, nelt, 1);
6118 sel.quick_grow (nelt);
6120 result_chain->quick_grow (length);
6121 memcpy (result_chain->address (), dr_chain.address (),
6122 length * sizeof (tree));
6124 if (pow2p_hwi (length) && vf > 4)
6126 unsigned int j, log_length = exact_log2 (length);
6127 for (i = 0; i < nelt / 2; ++i)
6128 sel[i] = i * 2;
6129 for (i = 0; i < nelt / 2; ++i)
6130 sel[nelt / 2 + i] = i * 2 + 1;
6131 vec_perm_indices indices (sel, 2, nelt);
6132 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6134 if (dump_enabled_p ())
6135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6136 "shuffle of 2 fields structure is not \
6137 supported by target\n");
6138 return false;
6140 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6142 for (i = 0; i < nelt / 2; ++i)
6143 sel[i] = i * 2 + 1;
6144 for (i = 0; i < nelt / 2; ++i)
6145 sel[nelt / 2 + i] = i * 2;
6146 indices.new_vector (sel, 2, nelt);
6147 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6149 if (dump_enabled_p ())
6150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6151 "shuffle of 2 fields structure is not \
6152 supported by target\n");
6153 return false;
6155 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
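      /* Illustration: for vector length 8 the two masks above are
         {0 2 4 6 1 3 5 7} and {1 3 5 7 0 2 4 6} respectively.  */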
6157 /* Generating permutation constant to shift all elements.
6158 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6159 for (i = 0; i < nelt; i++)
6160 sel[i] = nelt / 2 + i;
6161 indices.new_vector (sel, 2, nelt);
6162 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6164 if (dump_enabled_p ())
6165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6166 "shift permutation is not supported by target\n");
6167 return false;
6169 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6171 /* Generating permutation constant to select the low half of vect[0] and the high half of vect[1].
6172 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6173 for (i = 0; i < nelt / 2; i++)
6174 sel[i] = i;
6175 for (i = nelt / 2; i < nelt; i++)
6176 sel[i] = nelt + i;
6177 indices.new_vector (sel, 2, nelt);
6178 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6180 if (dump_enabled_p ())
6181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6182 "select is not supported by target\n");
6183 return false;
6185 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6187 for (i = 0; i < log_length; i++)
6189 for (j = 0; j < length; j += 2)
6191 first_vect = dr_chain[j];
6192 second_vect = dr_chain[j + 1];
6194 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6195 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6196 first_vect, first_vect,
6197 perm2_mask1);
6198 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6199 vect[0] = data_ref;
6201 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6202 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6203 second_vect, second_vect,
6204 perm2_mask2);
6205 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6206 vect[1] = data_ref;
6208 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6209 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6210 vect[0], vect[1], shift1_mask);
6211 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6212 (*result_chain)[j/2 + length/2] = data_ref;
6214 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6215 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6216 vect[0], vect[1], select_mask);
6217 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6218 (*result_chain)[j/2] = data_ref;
6220 memcpy (dr_chain.address (), result_chain->address (),
6221 length * sizeof (tree));
6223 return true;
6225 if (length == 3 && vf > 2)
6227 unsigned int k = 0, l = 0;
6229 /* Generating permutation constant to get all elements in right order.
6230 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6231 for (i = 0; i < nelt; i++)
6233 if (3 * k + (l % 3) >= nelt)
6235 k = 0;
6236 l += (3 - (nelt % 3));
6238 sel[i] = 3 * k + (l % 3);
6239 k++;
6241 vec_perm_indices indices (sel, 2, nelt);
6242 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6244 if (dump_enabled_p ())
6245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6246 "shuffle of 3 fields structure is not \
6247 supported by target\n");
6248 return false;
6250 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6252 /* Generating permutation constant to shift all elements.
6253 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6254 for (i = 0; i < nelt; i++)
6255 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6256 indices.new_vector (sel, 2, nelt);
6257 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6259 if (dump_enabled_p ())
6260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6261 "shift permutation is not supported by target\n");
6262 return false;
6264 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6266 /* Generating permutation constant to shift all elements.
6267 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6268 for (i = 0; i < nelt; i++)
6269 sel[i] = 2 * (nelt / 3) + 1 + i;
6270 indices.new_vector (sel, 2, nelt);
6271 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6273 if (dump_enabled_p ())
6274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6275 "shift permutation is not supported by target\n");
6276 return false;
6278 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6280 /* Generating permutation constant to shift all elements.
6281 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6282 for (i = 0; i < nelt; i++)
6283 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6284 indices.new_vector (sel, 2, nelt);
6285 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6287 if (dump_enabled_p ())
6288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6289 "shift permutation is not supported by target\n");
6290 return false;
6292 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6294 /* Generating permutation constant to shift all elements.
6295 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6296 for (i = 0; i < nelt; i++)
6297 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6298 indices.new_vector (sel, 2, nelt);
6299 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6301 if (dump_enabled_p ())
6302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6303 "shift permutation is not supported by target\n");
6304 return false;
6306 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6308 for (k = 0; k < 3; k++)
6310 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6311 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6312 dr_chain[k], dr_chain[k],
6313 perm3_mask);
6314 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6315 vect[k] = data_ref;
6318 for (k = 0; k < 3; k++)
6320 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6321 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6322 vect[k % 3], vect[(k + 1) % 3],
6323 shift1_mask);
6324 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6325 vect_shift[k] = data_ref;
6328 for (k = 0; k < 3; k++)
6330 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6331 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6332 vect_shift[(4 - k) % 3],
6333 vect_shift[(3 - k) % 3],
6334 shift2_mask);
6335 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6336 vect[k] = data_ref;
6339 (*result_chain)[3 - (nelt % 3)] = vect[2];
6341 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6342 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6343 vect[0], shift3_mask);
6344 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6345 (*result_chain)[nelt % 3] = data_ref;
6347 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6348 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6349 vect[1], shift4_mask);
6350 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6351 (*result_chain)[0] = data_ref;
6352 return true;
6354 return false;
6357 /* Function vect_transform_grouped_load.
6359 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6360 to perform their permutation and ascribe the resulting vectorized statements to
6361 the scalar statements.  */
6364 void
6365 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6366 gimple_stmt_iterator *gsi)
6368 machine_mode mode;
6369 vec<tree> result_chain = vNULL;
6371 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6372 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6373 vectors that are ready for vector computation. */
6374 result_chain.create (size);
6376 /* If the reassociation width for the vector type is 2 or greater, the target
6377 machine can execute 2 or more vector instructions in parallel. Otherwise
6378 try to get the chain for the load group using vect_shift_permute_load_chain. */
6379 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6380 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6381 || pow2p_hwi (size)
6382 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6383 gsi, &result_chain))
6384 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6385 vect_record_grouped_load_vectors (stmt, result_chain);
6386 result_chain.release ();
6389 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6390 generated as part of the vectorization of STMT. Assign the statement
6391 for each vector to the associated scalar statement. */
6393 void
6394 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6396 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6397 gimple *next_stmt, *new_stmt;
6398 unsigned int i, gap_count;
6399 tree tmp_data_ref;
6401 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6402 Since we scan the chain starting from its first node, their order
6403 corresponds to the order of data-refs in RESULT_CHAIN. */
6404 next_stmt = first_stmt;
6405 gap_count = 1;
6406 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6408 if (!next_stmt)
6409 break;
6411 /* Skip the gaps. Loads created for the gaps will be removed by the dead
6412 code elimination pass later. No need to check for the first stmt in
6413 the group, since it always exists.
6414 GROUP_GAP is the number of steps in elements from the previous
6415 access (if there is no gap GROUP_GAP is 1). We skip loads that
6416 correspond to the gaps. */
6417 if (next_stmt != first_stmt
6418 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
6420 gap_count++;
6421 continue;
6424 while (next_stmt)
6426 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6427 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6428 copies, and we put the new vector statement in the first available
6429 RELATED_STMT. */
6430 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6431 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6432 else
6434 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6436 gimple *prev_stmt =
6437 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6438 gimple *rel_stmt =
6439 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6440 while (rel_stmt)
6442 prev_stmt = rel_stmt;
6443 rel_stmt =
6444 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6447 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6448 new_stmt;
6452 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6453 gap_count = 1;
6454 /* If NEXT_STMT accesses the same DR as the previous statement,
6455 put the same TMP_DATA_REF as its vectorized statement; otherwise
6456 get the next data-ref from RESULT_CHAIN. */
6457 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6458 break;
6463 /* Function vect_can_force_dr_alignment_p.
6465 Returns whether the alignment of a DECL can be forced to be aligned
6466 on ALIGNMENT bit boundary. */
6468 bool
6469 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6471 if (!VAR_P (decl))
6472 return false;
6474 if (decl_in_symtab_p (decl)
6475 && !symtab_node::get (decl)->can_increase_alignment_p ())
6476 return false;
6478 if (TREE_STATIC (decl))
6479 return (alignment <= MAX_OFILE_ALIGNMENT);
6480 else
6481 return (alignment <= MAX_STACK_ALIGNMENT);
6485 /* Return whether the data reference DR is supported with respect to its
6486 alignment.
6487 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6488 if it is aligned, i.e., check if it is possible to vectorize it with different
6489 alignment. */
6491 enum dr_alignment_support
6492 vect_supportable_dr_alignment (struct data_reference *dr,
6493 bool check_aligned_accesses)
6495 gimple *stmt = DR_STMT (dr);
6496 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6497 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6498 machine_mode mode = TYPE_MODE (vectype);
6499 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6500 struct loop *vect_loop = NULL;
6501 bool nested_in_vect_loop = false;
6503 if (aligned_access_p (dr) && !check_aligned_accesses)
6504 return dr_aligned;
6506 /* For now assume all conditional loads/stores support unaligned
6507 access without any special code. */
6508 if (is_gimple_call (stmt)
6509 && gimple_call_internal_p (stmt)
6510 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6511 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6512 return dr_unaligned_supported;
6514 if (loop_vinfo)
6516 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6517 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6520 /* Possibly unaligned access. */
6522 /* We can choose between using the implicit realignment scheme (generating
6523 a misaligned_move stmt) and the explicit realignment scheme (generating
6524 aligned loads with a REALIGN_LOAD). There are two variants to the
6525 explicit realignment scheme: optimized, and unoptimized.
6526 We can optimize the realignment only if the step between consecutive
6527 vector loads is equal to the vector size. Since the vector memory
6528 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6529 is guaranteed that the misalignment amount remains the same throughout the
6530 execution of the vectorized loop. Therefore, we can create the
6531 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6532 at the loop preheader.
6534 However, in the case of outer-loop vectorization, when vectorizing a
6535 memory access in the inner-loop nested within the LOOP that is now being
6536 vectorized, while it is guaranteed that the misalignment of the
6537 vectorized memory access will remain the same in different outer-loop
6538 iterations, it is *not* guaranteed that it will remain the same throughout
6539 the execution of the inner-loop. This is because the inner-loop advances
6540 with the original scalar step (and not in steps of VS). If the inner-loop
6541 step happens to be a multiple of VS, then the misalignment remains fixed
6542 and we can use the optimized realignment scheme. For example:
6544 for (i=0; i<N; i++)
6545 for (j=0; j<M; j++)
6546 s += a[i+j];
6548 When vectorizing the i-loop in the above example, the step between
6549 consecutive vector loads is 1, and so the misalignment does not remain
6550 fixed across the execution of the inner-loop, and the realignment cannot
6551 be optimized (as illustrated in the following pseudo vectorized loop):
6553 for (i=0; i<N; i+=4)
6554 for (j=0; j<M; j++){
6555 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6556 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6557 // (assuming that we start from an aligned address).
6560 We therefore have to use the unoptimized realignment scheme:
6562 for (i=0; i<N; i+=4)
6563 for (j=k; j<M; j+=4)
6564 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6565 // that the misalignment of the initial address is
6566 // 0).
6568 The loop can then be vectorized as follows:
6570 for (k=0; k<4; k++){
6571 rt = get_realignment_token (&vp[k]);
6572 for (i=0; i<N; i+=4){
6573 v1 = vp[i+k];
6574 for (j=k; j<M; j+=4){
6575 v2 = vp[i+j+VS-1];
6576 va = REALIGN_LOAD <v1,v2,rt>;
6577 vs += va;
6578 v1 = v2;
6581 } */
6583 if (DR_IS_READ (dr))
6585 bool is_packed = false;
6586 tree type = (TREE_TYPE (DR_REF (dr)));
6588 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6589 && (!targetm.vectorize.builtin_mask_for_load
6590 || targetm.vectorize.builtin_mask_for_load ()))
6592 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5594 /* If we are doing SLP then the accesses need not have the
5595 same alignment; instead it depends on the SLP group size. */
6596 if (loop_vinfo
6597 && STMT_SLP_TYPE (stmt_info)
6598 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6599 * GROUP_SIZE (vinfo_for_stmt
6600 (GROUP_FIRST_ELEMENT (stmt_info))),
6601 TYPE_VECTOR_SUBPARTS (vectype)))
6603 else if (!loop_vinfo
6604 || (nested_in_vect_loop
6605 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6606 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6607 return dr_explicit_realign;
6608 else
6609 return dr_explicit_realign_optimized;
6611 if (!known_alignment_for_access_p (dr))
6612 is_packed = not_size_aligned (DR_REF (dr));
6614 if (targetm.vectorize.support_vector_misalignment
6615 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6616 /* Can't software pipeline the loads, but can at least do them. */
6617 return dr_unaligned_supported;
6619 else
6621 bool is_packed = false;
6622 tree type = (TREE_TYPE (DR_REF (dr)));
6624 if (!known_alignment_for_access_p (dr))
6625 is_packed = not_size_aligned (DR_REF (dr));
6627 if (targetm.vectorize.support_vector_misalignment
6628 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6629 return dr_unaligned_supported;
6632 /* Unsupported. */
6633 return dr_unaligned_unsupported;