gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "params.h"
  53 #include "tree-cfg.h"
  54 #include "tree-hash-traits.h"
  55 #include "vec-perm-indices.h"
  56 #include "internal-fn.h"
  57
  58 /* Return true if load- or store-lanes optab OPTAB is implemented for
  59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  60
  61 static bool
  62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  63                               tree vectype, unsigned HOST_WIDE_INT count)
  64 {
  65   machine_mode mode, array_mode;
  66   bool limit_p;
  67
  68   mode = TYPE_MODE (vectype);
  69   if (!targetm.array_mode (mode, count).exists (&array_mode))
  70     {
  71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
  72       limit_p = !targetm.array_mode_supported_p (mode, count);
  73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
  74         {
  75           if (dump_enabled_p ())
  76             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  77                              "no array mode for %s["
  78                              HOST_WIDE_INT_PRINT_DEC "]\n",
  79                              GET_MODE_NAME (mode), count);
  80           return false;
  81         }
  82     }
  83
  84   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  85     {
  86       if (dump_enabled_p ())
  87         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  88                          "cannot use %s<%s><%s>\n", name,
  89                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  90       return false;
  91     }
  92
  93   if (dump_enabled_p ())
  94     dump_printf_loc (MSG_NOTE, vect_location,
  95                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  96                      GET_MODE_NAME (mode));
  97
  98   return true;
  99 }
 100
 101
 102 /* Return the smallest scalar part of STMT.
 103    This is used to determine the vectype of the stmt.  We generally set the
 104    vectype according to the type of the result (lhs).  For stmts whose
 105    result-type is different than the type of the arguments (e.g., demotion,
 106    promotion), vectype will be reset appropriately (later).  Note that we have
 107    to visit the smallest datatype in this function, because that determines the
 108    VF.  If the smallest datatype in the loop is present only as the rhs of a
 109    promotion operation - we'd miss it.
 110    Such a case, where a variable of this datatype does not appear in the lhs
 111    anywhere in the loop, can only occur if it's an invariant: e.g.:
 112    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 113    invariant motion.  However, we cannot rely on invariant motion to always
 114    take invariants out of the loop, and so in the case of promotion we also
 115    have to check the rhs.
 116    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 117    types.  */
 118
 119 tree
 120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
 121                                HOST_WIDE_INT *rhs_size_unit)
 122 {
 123   tree scalar_type = gimple_expr_type (stmt);
 124   HOST_WIDE_INT lhs, rhs;
 125
 126   /* During the analysis phase, this function is called on arbitrary
 127      statements that might not have scalar results.  */
 128   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 129     return scalar_type;
 130
 131   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 132
 133   if (is_gimple_assign (stmt)
 134       && (gimple_assign_cast_p (stmt)
 135           || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
 136           || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
 137           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 138           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 139           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 140     {
 141       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 142
 143       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 144       if (rhs < lhs)
 145         scalar_type = rhs_type;
 146     }
 147
 148   *lhs_size_unit = lhs;
 149   *rhs_size_unit = rhs;
 150   return scalar_type;
 151 }
 152
 153
 154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 155    tested at run-time.  Return TRUE if DDR was successfully inserted.
 156    Return false if versioning is not supported.  */
 157
 158 static bool
 159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 160 {
 161   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 162
 163   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 164     return false;
 165
 166   if (!runtime_alias_check_p (ddr, loop,
 167                               optimize_loop_nest_for_speed_p (loop)))
 168     return false;
 169
 170   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 171   return true;
 172 }
 173
 174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
 175
 176 static void
 177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
 178 {
 179   vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 180   for (unsigned int i = 0; i < checks.length(); ++i)
 181     if (checks[i] == value)
 182       return;
 183
 184   if (dump_enabled_p ())
 185     {
 186       dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
 187       dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
 188       dump_printf (MSG_NOTE, " is nonzero\n");
 189     }
 190   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
 191 }
 192
 193 /* Return true if we know that the order of vectorized STMT_A and
 194    vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
 195    At least one of the statements is a write.  */
 196
 197 static bool
 198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
 199 {
 200   stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
 201   stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
 202
 203   /* Single statements are always kept in their original order.  */
 204   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 205       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 206     return true;
 207
 208   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
 209      group are emitted at the position of the first scalar load and all
 210      stores in a group are emitted at the position of the last scalar store.
 211      Thus writes will happen no earlier than their current position
 212      (but could happen later) while reads will happen no later than their
 213      current position (but could happen earlier).  Reordering is therefore
 214      only possible if the first access is a write.  */
 215   gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
 216   return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
 217 }
 218
 219 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 220    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 221    distances.  These distances are conservatively correct but they don't
 222    reflect a guaranteed dependence.
 223
 224    Return true if this function does all the work necessary to avoid
 225    an alias or false if the caller should use the dependence distances
 226    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 227    the depth of the loop described by LOOP_VINFO and the other arguments
 228    are as for vect_analyze_data_ref_dependence.  */
 229
 230 static bool
 231 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 232                                        loop_vec_info loop_vinfo,
 233                                        int loop_depth, unsigned int *max_vf)
 234 {
 235   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 236   lambda_vector dist_v;
 237   unsigned int i;
 238   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 239     {
 240       int dist = dist_v[loop_depth];
 241       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 242         {
 243           /* If the user asserted safelen >= DIST consecutive iterations
 244              can be executed concurrently, assume independence.
 245
 246              ??? An alternative would be to add the alias check even
 247              in this case, and vectorize the fallback loop with the
 248              maximum VF set to safelen.  However, if the user has
 249              explicitly given a length, it's less likely that that
 250              would be a win.  */
 251           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 252             {
 253               if ((unsigned int) loop->safelen < *max_vf)
 254                 *max_vf = loop->safelen;
 255               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 256               continue;
 257             }
 258
 259           /* For dependence distances of 2 or more, we have the option
 260              of limiting VF or checking for an alias at runtime.
 261              Prefer to check at runtime if we can, to avoid limiting
 262              the VF unnecessarily when the bases are in fact independent.
 263
 264              Note that the alias checks will be removed if the VF ends up
 265              being small enough.  */
 266           return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 267         }
 268     }
 269   return true;
 270 }
 271
 272
 273 /* Function vect_analyze_data_ref_dependence.
 274
 275    Return TRUE if there (might) exist a dependence between a memory-reference
 276    DRA and a memory-reference DRB.  When versioning for alias may check a
 277    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 278    the data dependence.  */
 279
 280 static bool
 281 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 282                                   loop_vec_info loop_vinfo,
 283                                   unsigned int *max_vf)
 284 {
 285   unsigned int i;
 286   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 287   struct data_reference *dra = DDR_A (ddr);
 288   struct data_reference *drb = DDR_B (ddr);
 289   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 290   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 291   lambda_vector dist_v;
 292   unsigned int loop_depth;
 293
 294   /* In loop analysis all data references should be vectorizable.  */
 295   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 296       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 297     gcc_unreachable ();
 298
 299   /* Independent data accesses.  */
 300   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 301     return false;
 302
 303   if (dra == drb
 304       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 305     return false;
 306
 307   /* We do not have to consider dependences between accesses that belong
 308      to the same group, unless the stride could be smaller than the
 309      group size.  */
 310   if (GROUP_FIRST_ELEMENT (stmtinfo_a)
 311       && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b)
 312       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
 313     return false;
 314
 315   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 316      least two scalar iterations, there is always also a true dependence.
 317      As the vectorizer does not re-order loads and stores we can ignore
 318      the anti-dependence if TBAA can disambiguate both DRs similar to the
 319      case with known negative distance anti-dependences (positive
 320      distance anti-dependences would violate TBAA constraints).  */
 321   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 322        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 323       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 324                                  get_alias_set (DR_REF (drb))))
 325     return false;
 326
 327   /* Unknown data dependence.  */
 328   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 329     {
 330       /* If user asserted safelen consecutive iterations can be
 331          executed concurrently, assume independence.  */
 332       if (loop->safelen >= 2)
 333         {
 334           if ((unsigned int) loop->safelen < *max_vf)
 335             *max_vf = loop->safelen;
 336           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 337           return false;
 338         }
 339
 340       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 341           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 342         {
 343           if (dump_enabled_p ())
 344             {
 345               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 346                                "versioning for alias not supported for: "
 347                                "can't determine dependence between ");
 348               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 349                                  DR_REF (dra));
 350               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 351               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 352                                  DR_REF (drb));
 353               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 354             }
 355           return true;
 356         }
 357
 358       if (dump_enabled_p ())
 359         {
 360           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 361                            "versioning for alias required: "
 362                            "can't determine dependence between ");
 363           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 364                              DR_REF (dra));
 365           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 366           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 367                              DR_REF (drb));
 368           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 369         }
 370
 371       /* Add to list of ddrs that need to be tested at run-time.  */
 372       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 373     }
 374
 375   /* Known data dependence.  */
 376   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 377     {
 378       /* If user asserted safelen consecutive iterations can be
 379          executed concurrently, assume independence.  */
 380       if (loop->safelen >= 2)
 381         {
 382           if ((unsigned int) loop->safelen < *max_vf)
 383             *max_vf = loop->safelen;
 384           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 385           return false;
 386         }
 387
 388       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 389           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 390         {
 391           if (dump_enabled_p ())
 392             {
 393               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 394                                "versioning for alias not supported for: "
 395                                "bad dist vector for ");
 396               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 397                                  DR_REF (dra));
 398               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 399               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 400                                  DR_REF (drb));
 401               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 402             }
 403           return true;
 404         }
 405
 406       if (dump_enabled_p ())
 407         {
 408           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 409                            "versioning for alias required: "
 410                            "bad dist vector for ");
 411           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 412           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 413           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 414           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 415         }
 416       /* Add to list of ddrs that need to be tested at run-time.  */
 417       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 418     }
 419
 420   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 421
 422   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 423       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 424                                                 loop_depth, max_vf))
 425     return false;
 426
 427   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 428     {
 429       int dist = dist_v[loop_depth];
 430
 431       if (dump_enabled_p ())
 432         dump_printf_loc (MSG_NOTE, vect_location,
 433                          "dependence distance  = %d.\n", dist);
 434
 435       if (dist == 0)
 436         {
 437           if (dump_enabled_p ())
 438             {
 439               dump_printf_loc (MSG_NOTE, vect_location,
 440                                "dependence distance == 0 between ");
 441               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 442               dump_printf (MSG_NOTE, " and ");
 443               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 444               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 445             }
 446
 447           /* When we perform grouped accesses and perform implicit CSE
 448              by detecting equal accesses and doing disambiguation with
 449              runtime alias tests like for
 450                 .. = a[i];
 451                 .. = a[i+1];
 452                 a[i] = ..;
 453                 a[i+1] = ..;
 454                 *p = ..;
 455                 .. = a[i];
 456                 .. = a[i+1];
 457              where we will end up loading { a[i], a[i+1] } once, make
 458              sure that inserting group loads before the first load and
 459              stores after the last store will do the right thing.
 460              Similar for groups like
 461                 a[i] = ...;
 462                 ... = a[i];
 463                 a[i+1] = ...;
 464              where loads from the group interleave with the store.  */
 465           if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
 466             {
 467               if (dump_enabled_p ())
 468                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 469                                  "READ_WRITE dependence in interleaving.\n");
 470               return true;
 471             }
 472
 473           if (loop->safelen < 2)
 474             {
 475               tree indicator = dr_zero_step_indicator (dra);
 476               if (TREE_CODE (indicator) != INTEGER_CST)
 477                 vect_check_nonzero_value (loop_vinfo, indicator);
 478               else if (integer_zerop (indicator))
 479                 {
 480                   if (dump_enabled_p ())
 481                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 482                                  "access also has a zero step\n");
 483                   return true;
 484                 }
 485             }
 486           continue;
 487         }
 488
 489       if (dist > 0 && DDR_REVERSED_P (ddr))
 490         {
 491           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 492              reversed (to make distance vector positive), and the actual
 493              distance is negative.  */
 494           if (dump_enabled_p ())
 495             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 496                              "dependence distance negative.\n");
 497           /* Record a negative dependence distance to later limit the
 498              amount of stmt copying / unrolling we can perform.
 499              Only need to handle read-after-write dependence.  */
 500           if (DR_IS_READ (drb)
 501               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 502                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 503             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 504           continue;
 505         }
 506
 507       unsigned int abs_dist = abs (dist);
 508       if (abs_dist >= 2 && abs_dist < *max_vf)
 509         {
 510           /* The dependence distance requires reduction of the maximal
 511              vectorization factor.  */
 512           *max_vf = abs (dist);
 513           if (dump_enabled_p ())
 514             dump_printf_loc (MSG_NOTE, vect_location,
 515                              "adjusting maximal vectorization factor to %i\n",
 516                              *max_vf);
 517         }
 518
 519       if (abs_dist >= *max_vf)
 520         {
 521           /* Dependence distance does not create dependence, as far as
 522              vectorization is concerned, in this case.  */
 523           if (dump_enabled_p ())
 524             dump_printf_loc (MSG_NOTE, vect_location,
 525                              "dependence distance >= VF.\n");
 526           continue;
 527         }
 528
 529       if (dump_enabled_p ())
 530         {
 531           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 532                        "not vectorized, possible dependence "
 533                        "between data-refs ");
 534           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 535           dump_printf (MSG_NOTE,  " and ");
 536           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 537           dump_printf (MSG_NOTE,  "\n");
 538         }
 539
 540       return true;
 541     }
 542
 543   return false;
 544 }
 545
 546 /* Function vect_analyze_data_ref_dependences.
 547
 548    Examine all the data references in the loop, and make sure there do not
 549    exist any data dependences between them.  Set *MAX_VF according to
 550    the maximum vectorization factor the data dependences allow.  */
 551
 552 bool
 553 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 554                                    unsigned int *max_vf)
 555 {
 556   unsigned int i;
 557   struct data_dependence_relation *ddr;
 558
 559   if (dump_enabled_p ())
 560     dump_printf_loc (MSG_NOTE, vect_location,
 561                      "=== vect_analyze_data_ref_dependences ===\n");
 562
 563   LOOP_VINFO_DDRS (loop_vinfo)
 564     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 565              * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 566   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 567   /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
 568   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 569                                 &LOOP_VINFO_DDRS (loop_vinfo),
 570                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 571     return false;
 572
 573   /* For epilogues we either have no aliases or alias versioning
 574      was applied to original loop.  Therefore we may just get max_vf
 575      using VF of original loop.  */
 576   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 577     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 578   else
 579     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 580       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 581         return false;
 582
 583   return true;
 584 }
 585
 586
 587 /* Function vect_slp_analyze_data_ref_dependence.
 588
 589    Return TRUE if there (might) exist a dependence between a memory-reference
 590    DRA and a memory-reference DRB.  When versioning for alias may check a
 591    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 592    the data dependence.  */
 593
 594 static bool
 595 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 596 {
 597   struct data_reference *dra = DDR_A (ddr);
 598   struct data_reference *drb = DDR_B (ddr);
 599
 600   /* We need to check dependences of statements marked as unvectorizable
 601      as well, they still can prohibit vectorization.  */
 602
 603   /* Independent data accesses.  */
 604   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 605     return false;
 606
 607   if (dra == drb)
 608     return false;
 609
 610   /* Read-read is OK.  */
 611   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 612     return false;
 613
 614   /* If dra and drb are part of the same interleaving chain consider
 615      them independent.  */
 616   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 617       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 618           == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 619     return false;
 620
 621   /* Unknown data dependence.  */
 622   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 623     {
 624       if  (dump_enabled_p ())
 625         {
 626           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 627                            "can't determine dependence between ");
 628           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 629           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 630           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 631           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 632         }
 633     }
 634   else if (dump_enabled_p ())
 635     {
 636       dump_printf_loc (MSG_NOTE, vect_location,
 637                        "determined dependence between ");
 638       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 639       dump_printf (MSG_NOTE, " and ");
 640       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 641       dump_printf (MSG_NOTE,  "\n");
 642     }
 643
 644   return true;
 645 }
 646
 647
 648 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 649    contain the vector of scalar stores of this instance if we are
 650    disambiguating the loads.  */
 651
 652 static bool
 653 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
 654                                    vec<gimple *> stores, gimple *last_store)
 655 {
 656   /* This walks over all stmts involved in the SLP load/store done
 657      in NODE verifying we can sink them up to the last stmt in the
 658      group.  */
 659   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
 660   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 661     {
 662       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
 663       if (access == last_access)
 664         continue;
 665       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
 666       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
 667            gsi_stmt (gsi) != last_access; gsi_next (&gsi))
 668         {
 669           gimple *stmt = gsi_stmt (gsi);
 670           if (! gimple_vuse (stmt)
 671               || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
 672             continue;
 673
 674           /* If we couldn't record a (single) data reference for this
 675              stmt we have to give up.  */
 676           /* ???  Here and below if dependence analysis fails we can resort
 677              to the alias oracle which can handle more kinds of stmts.  */
 678           data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
 679           if (!dr_b)
 680             return false;
 681
 682           bool dependent = false;
 683           /* If we run into a store of this same instance (we've just
 684              marked those) then delay dependence checking until we run
 685              into the last store because this is where it will have
 686              been sunk to (and we verify if we can do that as well).  */
 687           if (gimple_visited_p (stmt))
 688             {
 689               if (stmt != last_store)
 690                 continue;
 691               unsigned i;
 692               gimple *store;
 693               FOR_EACH_VEC_ELT (stores, i, store)
 694                 {
 695                   data_reference *store_dr
 696                     = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
 697                   ddr_p ddr = initialize_data_dependence_relation
 698                                 (dr_a, store_dr, vNULL);
 699                   dependent = vect_slp_analyze_data_ref_dependence (ddr);
 700                   free_dependence_relation (ddr);
 701                   if (dependent)
 702                     break;
 703                 }
 704             }
 705           else
 706             {
 707               ddr_p ddr = initialize_data_dependence_relation (dr_a,
 708                                                                dr_b, vNULL);
 709               dependent = vect_slp_analyze_data_ref_dependence (ddr);
 710               free_dependence_relation (ddr);
 711             }
 712           if (dependent)
 713             return false;
 714         }
 715     }
 716   return true;
 717 }
 718
 719
 720 /* Function vect_analyze_data_ref_dependences.
 721
 722    Examine all the data references in the basic-block, and make sure there
 723    do not exist any data dependences between them.  Set *MAX_VF according to
 724    the maximum vectorization factor the data dependences allow.  */
 725
 726 bool
 727 vect_slp_analyze_instance_dependence (slp_instance instance)
 728 {
 729   if (dump_enabled_p ())
 730     dump_printf_loc (MSG_NOTE, vect_location,
 731                      "=== vect_slp_analyze_instance_dependence ===\n");
 732
 733   /* The stores of this instance are at the root of the SLP tree.  */
 734   slp_tree store = SLP_INSTANCE_TREE (instance);
 735   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
 736     store = NULL;
 737
 738   /* Verify we can sink stores to the vectorized stmt insert location.  */
 739   gimple *last_store = NULL;
 740   if (store)
 741     {
 742       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
 743         return false;
 744
 745       /* Mark stores in this instance and remember the last one.  */
 746       last_store = vect_find_last_scalar_stmt_in_slp (store);
 747       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 748         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
 749     }
 750
 751   bool res = true;
 752
 753   /* Verify we can sink loads to the vectorized stmt insert location,
 754      special-casing stores of this instance.  */
 755   slp_tree load;
 756   unsigned int i;
 757   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
 758     if (! vect_slp_analyze_node_dependences (instance, load,
 759                                              store
 760                                              ? SLP_TREE_SCALAR_STMTS (store)
 761                                              : vNULL, last_store))
 762       {
 763         res = false;
 764         break;
 765       }
 766
 767   /* Unset the visited flag.  */
 768   if (store)
 769     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 770       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
 771
 772   return res;
 773 }
 774
 775 /* Record in VINFO the base alignment guarantee given by DRB.  STMT is
 776    the statement that contains DRB, which is useful for recording in the
 777    dump file.  */
 778
 779 static void
 780 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
 781                             innermost_loop_behavior *drb)
 782 {
 783   bool existed;
 784   innermost_loop_behavior *&entry
 785     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 786   if (!existed || entry->base_alignment < drb->base_alignment)
 787     {
 788       entry = drb;
 789       if (dump_enabled_p ())
 790         {
 791           dump_printf_loc (MSG_NOTE, vect_location,
 792                            "recording new base alignment for ");
 793           dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
 794           dump_printf (MSG_NOTE, "\n");
 795           dump_printf_loc (MSG_NOTE, vect_location,
 796                            "  alignment:    %d\n", drb->base_alignment);
 797           dump_printf_loc (MSG_NOTE, vect_location,
 798                            "  misalignment: %d\n", drb->base_misalignment);
 799           dump_printf_loc (MSG_NOTE, vect_location,
 800                            "  based on:     ");
 801           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 802         }
 803     }
 804 }
 805
 806 /* If the region we're going to vectorize is reached, all unconditional
 807    data references occur at least once.  We can therefore pool the base
 808    alignment guarantees from each unconditional reference.  Do this by
 809    going through all the data references in VINFO and checking whether
 810    the containing statement makes the reference unconditionally.  If so,
 811    record the alignment of the base address in VINFO so that it can be
 812    used for all other references with the same base.  */
 813
 814 void
 815 vect_record_base_alignments (vec_info *vinfo)
 816 {
 817   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 818   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 819   data_reference *dr;
 820   unsigned int i;
 821   FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
 822     if (!DR_IS_CONDITIONAL_IN_STMT (dr))
 823       {
 824         gimple *stmt = DR_STMT (dr);
 825         vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
 826
 827         /* If DR is nested in the loop that is being vectorized, we can also
 828            record the alignment of the base wrt the outer loop.  */
 829         if (loop && nested_in_vect_loop_p (loop, stmt))
 830           {
 831             stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 832             vect_record_base_alignment
 833               (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 834           }
 835       }
 836 }
 837
 838 /* Return the target alignment for the vectorized form of DR.  */
 839
 840 static unsigned int
 841 vect_calculate_target_alignment (struct data_reference *dr)
 842 {
 843   gimple *stmt = DR_STMT (dr);
 844   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 845   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 846   return targetm.vectorize.preferred_vector_alignment (vectype);
 847 }
 848
 849 /* Function vect_compute_data_ref_alignment
 850
 851    Compute the misalignment of the data reference DR.
 852
 853    Output:
 854    1. If during the misalignment computation it is found that the data reference
 855       cannot be vectorized then false is returned.
 856    2. DR_MISALIGNMENT (DR) is defined.
 857
 858    FOR NOW: No analysis is actually performed. Misalignment is calculated
 859    only for trivial cases. TODO.  */
 860
 861 bool
 862 vect_compute_data_ref_alignment (struct data_reference *dr)
 863 {
 864   gimple *stmt = DR_STMT (dr);
 865   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 866   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
 867   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 868   struct loop *loop = NULL;
 869   tree ref = DR_REF (dr);
 870   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 871
 872   if (dump_enabled_p ())
 873     dump_printf_loc (MSG_NOTE, vect_location,
 874                      "vect_compute_data_ref_alignment:\n");
 875
 876   if (loop_vinfo)
 877     loop = LOOP_VINFO_LOOP (loop_vinfo);
 878
 879   /* Initialize misalignment to unknown.  */
 880   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
 881
 882   innermost_loop_behavior *drb = vect_dr_behavior (dr);
 883   bool step_preserves_misalignment_p;
 884
 885   unsigned HOST_WIDE_INT vector_alignment
 886     = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
 887   DR_TARGET_ALIGNMENT (dr) = vector_alignment;
 888
 889   /* No step for BB vectorization.  */
 890   if (!loop)
 891     {
 892       gcc_assert (integer_zerop (drb->step));
 893       step_preserves_misalignment_p = true;
 894     }
 895
 896   /* In case the dataref is in an inner-loop of the loop that is being
 897      vectorized (LOOP), we use the base and misalignment information
 898      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 899      stays the same throughout the execution of the inner-loop, which is why
 900      we have to check that the stride of the dataref in the inner-loop evenly
 901      divides by the vector alignment.  */
 902   else if (nested_in_vect_loop_p (loop, stmt))
 903     {
 904       step_preserves_misalignment_p
 905         = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
 906
 907       if (dump_enabled_p ())
 908         {
 909           if (step_preserves_misalignment_p)
 910             dump_printf_loc (MSG_NOTE, vect_location,
 911                              "inner step divides the vector alignment.\n");
 912           else
 913             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 914                              "inner step doesn't divide the vector"
 915                              " alignment.\n");
 916         }
 917     }
 918
 919   /* Similarly we can only use base and misalignment information relative to
 920      an innermost loop if the misalignment stays the same throughout the
 921      execution of the loop.  As above, this is the case if the stride of
 922      the dataref evenly divides by the alignment.  */
 923   else
 924     {
 925       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 926       step_preserves_misalignment_p
 927         = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
 928
 929       if (!step_preserves_misalignment_p && dump_enabled_p ())
 930         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 931                          "step doesn't divide the vector alignment.\n");
 932     }
 933
 934   unsigned int base_alignment = drb->base_alignment;
 935   unsigned int base_misalignment = drb->base_misalignment;
 936
 937   /* Calculate the maximum of the pooled base address alignment and the
 938      alignment that we can compute for DR itself.  */
 939   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
 940   if (entry && base_alignment < (*entry)->base_alignment)
 941     {
 942       base_alignment = (*entry)->base_alignment;
 943       base_misalignment = (*entry)->base_misalignment;
 944     }
 945
 946   if (drb->offset_alignment < vector_alignment
 947       || !step_preserves_misalignment_p
 948       /* We need to know whether the step wrt the vectorized loop is
 949          negative when computing the starting misalignment below.  */
 950       || TREE_CODE (drb->step) != INTEGER_CST)
 951     {
 952       if (dump_enabled_p ())
 953         {
 954           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 955                            "Unknown alignment for access: ");
 956           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 957           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 958         }
 959       return true;
 960     }
 961
 962   if (base_alignment < vector_alignment)
 963     {
 964       unsigned int max_alignment;
 965       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
 966       if (max_alignment < vector_alignment
 967           || !vect_can_force_dr_alignment_p (base,
 968                                              vector_alignment * BITS_PER_UNIT))
 969         {
 970           if (dump_enabled_p ())
 971             {
 972               dump_printf_loc (MSG_NOTE, vect_location,
 973                                "can't force alignment of ref: ");
 974               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 975               dump_printf (MSG_NOTE, "\n");
 976             }
 977           return true;
 978         }
 979
 980       /* Force the alignment of the decl.
 981          NOTE: This is the only change to the code we make during
 982          the analysis phase, before deciding to vectorize the loop.  */
 983       if (dump_enabled_p ())
 984         {
 985           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
 986           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 987           dump_printf (MSG_NOTE, "\n");
 988         }
 989
 990       DR_VECT_AUX (dr)->base_decl = base;
 991       DR_VECT_AUX (dr)->base_misaligned = true;
 992       base_misalignment = 0;
 993     }
 994   poly_int64 misalignment
 995     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
 996
 997   /* If this is a backward running DR then first access in the larger
 998      vectype actually is N-1 elements before the address in the DR.
 999      Adjust misalign accordingly.  */
1000   if (tree_int_cst_sgn (drb->step) < 0)
1001     /* PLUS because STEP is negative.  */
1002     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1003                      * TREE_INT_CST_LOW (drb->step));
1004
1005   unsigned int const_misalignment;
1006   if (!known_misalignment (misalignment, vector_alignment,
1007                            &const_misalignment))
1008     {
1009       if (dump_enabled_p ())
1010         {
1011           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1012                            "Non-constant misalignment for access: ");
1013           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1014           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1015         }
1016       return true;
1017     }
1018
1019   SET_DR_MISALIGNMENT (dr, const_misalignment);
1020
1021   if (dump_enabled_p ())
1022     {
1023       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1024                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1025       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1026       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1027     }
1028
1029   return true;
1030 }
1031
1032 /* Function vect_update_misalignment_for_peel.
1033    Sets DR's misalignment
1034    - to 0 if it has the same alignment as DR_PEEL,
1035    - to the misalignment computed using NPEEL if DR's salignment is known,
1036    - to -1 (unknown) otherwise.
1037
1038    DR - the data reference whose misalignment is to be adjusted.
1039    DR_PEEL - the data reference whose misalignment is being made
1040              zero in the vector loop by the peel.
1041    NPEEL - the number of iterations in the peel loop if the misalignment
1042            of DR_PEEL is known at compile time.  */
1043
1044 static void
1045 vect_update_misalignment_for_peel (struct data_reference *dr,
1046                                    struct data_reference *dr_peel, int npeel)
1047 {
1048   unsigned int i;
1049   vec<dr_p> same_aligned_drs;
1050   struct data_reference *current_dr;
1051   int dr_size = vect_get_scalar_dr_size (dr);
1052   int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1053   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1054   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1055
1056  /* For interleaved data accesses the step in the loop must be multiplied by
1057      the size of the interleaving group.  */
1058   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1059     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1060   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1061     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1062
1063   /* It can be assumed that the data refs with the same alignment as dr_peel
1064      are aligned in the vector loop.  */
1065   same_aligned_drs
1066     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1067   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1068     {
1069       if (current_dr != dr)
1070         continue;
1071       gcc_assert (!known_alignment_for_access_p (dr)
1072                   || !known_alignment_for_access_p (dr_peel)
1073                   || (DR_MISALIGNMENT (dr) / dr_size
1074                       == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1075       SET_DR_MISALIGNMENT (dr, 0);
1076       return;
1077     }
1078
1079   if (known_alignment_for_access_p (dr)
1080       && known_alignment_for_access_p (dr_peel))
1081     {
1082       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1083       int misal = DR_MISALIGNMENT (dr);
1084       misal += negative ? -npeel * dr_size : npeel * dr_size;
1085       misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1086       SET_DR_MISALIGNMENT (dr, misal);
1087       return;
1088     }
1089
1090   if (dump_enabled_p ())
1091     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1092                      "to unknown (-1).\n");
1093   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1094 }
1095
1096
1097 /* Function verify_data_ref_alignment
1098
1099    Return TRUE if DR can be handled with respect to alignment.  */
1100
1101 static bool
1102 verify_data_ref_alignment (data_reference_p dr)
1103 {
1104   enum dr_alignment_support supportable_dr_alignment
1105     = vect_supportable_dr_alignment (dr, false);
1106   if (!supportable_dr_alignment)
1107     {
1108       if (dump_enabled_p ())
1109         {
1110           if (DR_IS_READ (dr))
1111             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1112                              "not vectorized: unsupported unaligned load.");
1113           else
1114             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1115                              "not vectorized: unsupported unaligned "
1116                              "store.");
1117
1118           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1119                              DR_REF (dr));
1120           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1121         }
1122       return false;
1123     }
1124
1125   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1126     dump_printf_loc (MSG_NOTE, vect_location,
1127                      "Vectorizing an unaligned access.\n");
1128
1129   return true;
1130 }
1131
1132 /* Function vect_verify_datarefs_alignment
1133
1134    Return TRUE if all data references in the loop can be
1135    handled with respect to alignment.  */
1136
1137 bool
1138 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1139 {
1140   vec<data_reference_p> datarefs = vinfo->datarefs;
1141   struct data_reference *dr;
1142   unsigned int i;
1143
1144   FOR_EACH_VEC_ELT (datarefs, i, dr)
1145     {
1146       gimple *stmt = DR_STMT (dr);
1147       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1148
1149       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1150         continue;
1151
1152       /* For interleaving, only the alignment of the first access matters.   */
1153       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1154           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1155         continue;
1156
1157       /* Strided accesses perform only component accesses, alignment is
1158          irrelevant for them.  */
1159       if (STMT_VINFO_STRIDED_P (stmt_info)
1160           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1161         continue;
1162
1163       if (! verify_data_ref_alignment (dr))
1164         return false;
1165     }
1166
1167   return true;
1168 }
1169
1170 /* Given an memory reference EXP return whether its alignment is less
1171    than its size.  */
1172
1173 static bool
1174 not_size_aligned (tree exp)
1175 {
1176   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1177     return true;
1178
1179   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1180           > get_object_alignment (exp));
1181 }
1182
1183 /* Function vector_alignment_reachable_p
1184
1185    Return true if vector alignment for DR is reachable by peeling
1186    a few loop iterations.  Return false otherwise.  */
1187
1188 static bool
1189 vector_alignment_reachable_p (struct data_reference *dr)
1190 {
1191   gimple *stmt = DR_STMT (dr);
1192   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1193   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1194
1195   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1196     {
1197       /* For interleaved access we peel only if number of iterations in
1198          the prolog loop ({VF - misalignment}), is a multiple of the
1199          number of the interleaved accesses.  */
1200       int elem_size, mis_in_elements;
1201
1202       /* FORNOW: handle only known alignment.  */
1203       if (!known_alignment_for_access_p (dr))
1204         return false;
1205
1206       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1207       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1208       elem_size = vector_element_size (vector_size, nelements);
1209       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1210
1211       if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1212         return false;
1213     }
1214
1215   /* If misalignment is known at the compile time then allow peeling
1216      only if natural alignment is reachable through peeling.  */
1217   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1218     {
1219       HOST_WIDE_INT elmsize =
1220                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1221       if (dump_enabled_p ())
1222         {
1223           dump_printf_loc (MSG_NOTE, vect_location,
1224                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1225           dump_printf (MSG_NOTE,
1226                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1227         }
1228       if (DR_MISALIGNMENT (dr) % elmsize)
1229         {
1230           if (dump_enabled_p ())
1231             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1232                              "data size does not divide the misalignment.\n");
1233           return false;
1234         }
1235     }
1236
1237   if (!known_alignment_for_access_p (dr))
1238     {
1239       tree type = TREE_TYPE (DR_REF (dr));
1240       bool is_packed = not_size_aligned (DR_REF (dr));
1241       if (dump_enabled_p ())
1242         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1243                          "Unknown misalignment, %snaturally aligned\n",
1244                          is_packed ? "not " : "");
1245       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1246     }
1247
1248   return true;
1249 }
1250
1251
1252 /* Calculate the cost of the memory access represented by DR.  */
1253
1254 static void
1255 vect_get_data_access_cost (struct data_reference *dr,
1256                            unsigned int *inside_cost,
1257                            unsigned int *outside_cost,
1258                            stmt_vector_for_cost *body_cost_vec,
1259                            stmt_vector_for_cost *prologue_cost_vec)
1260 {
1261   gimple *stmt = DR_STMT (dr);
1262   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1263   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1264   int ncopies;
1265
1266   if (PURE_SLP_STMT (stmt_info))
1267     ncopies = 1;
1268   else
1269     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1270
1271   if (DR_IS_READ (dr))
1272     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1273                         prologue_cost_vec, body_cost_vec, false);
1274   else
1275     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1276
1277   if (dump_enabled_p ())
1278     dump_printf_loc (MSG_NOTE, vect_location,
1279                      "vect_get_data_access_cost: inside_cost = %d, "
1280                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1281 }
1282
1283
1284 typedef struct _vect_peel_info
1285 {
1286   struct data_reference *dr;
1287   int npeel;
1288   unsigned int count;
1289 } *vect_peel_info;
1290
1291 typedef struct _vect_peel_extended_info
1292 {
1293   struct _vect_peel_info peel_info;
1294   unsigned int inside_cost;
1295   unsigned int outside_cost;
1296 } *vect_peel_extended_info;
1297
1298
1299 /* Peeling hashtable helpers.  */
1300
1301 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1302 {
1303   static inline hashval_t hash (const _vect_peel_info *);
1304   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1305 };
1306
1307 inline hashval_t
1308 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1309 {
1310   return (hashval_t) peel_info->npeel;
1311 }
1312
1313 inline bool
1314 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1315 {
1316   return (a->npeel == b->npeel);
1317 }
1318
1319
1320 /* Insert DR into peeling hash table with NPEEL as key.  */
1321
1322 static void
1323 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1324                           loop_vec_info loop_vinfo, struct data_reference *dr,
1325                           int npeel)
1326 {
1327   struct _vect_peel_info elem, *slot;
1328   _vect_peel_info **new_slot;
1329   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1330
1331   elem.npeel = npeel;
1332   slot = peeling_htab->find (&elem);
1333   if (slot)
1334     slot->count++;
1335   else
1336     {
1337       slot = XNEW (struct _vect_peel_info);
1338       slot->npeel = npeel;
1339       slot->dr = dr;
1340       slot->count = 1;
1341       new_slot = peeling_htab->find_slot (slot, INSERT);
1342       *new_slot = slot;
1343     }
1344
1345   if (!supportable_dr_alignment
1346       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1347     slot->count += VECT_MAX_COST;
1348 }
1349
1350
1351 /* Traverse peeling hash table to find peeling option that aligns maximum
1352    number of data accesses.  */
1353
1354 int
1355 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1356                                      _vect_peel_extended_info *max)
1357 {
1358   vect_peel_info elem = *slot;
1359
1360   if (elem->count > max->peel_info.count
1361       || (elem->count == max->peel_info.count
1362           && max->peel_info.npeel > elem->npeel))
1363     {
1364       max->peel_info.npeel = elem->npeel;
1365       max->peel_info.count = elem->count;
1366       max->peel_info.dr = elem->dr;
1367     }
1368
1369   return 1;
1370 }
1371
1372 /* Get the costs of peeling NPEEL iterations checking data access costs
1373    for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1374    misalignment will be zero after peeling.  */
1375
1376 static void
1377 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1378                                 struct data_reference *dr0,
1379                                 unsigned int *inside_cost,
1380                                 unsigned int *outside_cost,
1381                                 stmt_vector_for_cost *body_cost_vec,
1382                                 stmt_vector_for_cost *prologue_cost_vec,
1383                                 unsigned int npeel,
1384                                 bool unknown_misalignment)
1385 {
1386   unsigned i;
1387   data_reference *dr;
1388
1389   FOR_EACH_VEC_ELT (datarefs, i, dr)
1390     {
1391       gimple *stmt = DR_STMT (dr);
1392       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1393       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1394         continue;
1395
1396       /* For interleaving, only the alignment of the first access
1397          matters.  */
1398       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1399           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1400         continue;
1401
1402       /* Strided accesses perform only component accesses, alignment is
1403          irrelevant for them.  */
1404       if (STMT_VINFO_STRIDED_P (stmt_info)
1405           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1406         continue;
1407
1408       int save_misalignment;
1409       save_misalignment = DR_MISALIGNMENT (dr);
1410       if (npeel == 0)
1411         ;
1412       else if (unknown_misalignment && dr == dr0)
1413         SET_DR_MISALIGNMENT (dr, 0);
1414       else
1415         vect_update_misalignment_for_peel (dr, dr0, npeel);
1416       vect_get_data_access_cost (dr, inside_cost, outside_cost,
1417                                  body_cost_vec, prologue_cost_vec);
1418       SET_DR_MISALIGNMENT (dr, save_misalignment);
1419     }
1420 }
1421
1422 /* Traverse peeling hash table and calculate cost for each peeling option.
1423    Find the one with the lowest cost.  */
1424
1425 int
1426 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1427                                    _vect_peel_extended_info *min)
1428 {
1429   vect_peel_info elem = *slot;
1430   int dummy;
1431   unsigned int inside_cost = 0, outside_cost = 0;
1432   gimple *stmt = DR_STMT (elem->dr);
1433   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1434   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1435   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1436                        epilogue_cost_vec;
1437
1438   prologue_cost_vec.create (2);
1439   body_cost_vec.create (2);
1440   epilogue_cost_vec.create (2);
1441
1442   vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1443                                   elem->dr, &inside_cost, &outside_cost,
1444                                   &body_cost_vec, &prologue_cost_vec,
1445                                   elem->npeel, false);
1446
1447   body_cost_vec.release ();
1448
1449   outside_cost += vect_get_known_peeling_cost
1450     (loop_vinfo, elem->npeel, &dummy,
1451      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1452      &prologue_cost_vec, &epilogue_cost_vec);
1453
1454   /* Prologue and epilogue costs are added to the target model later.
1455      These costs depend only on the scalar iteration cost, the
1456      number of peeling iterations finally chosen, and the number of
1457      misaligned statements.  So discard the information found here.  */
1458   prologue_cost_vec.release ();
1459   epilogue_cost_vec.release ();
1460
1461   if (inside_cost < min->inside_cost
1462       || (inside_cost == min->inside_cost
1463           && outside_cost < min->outside_cost))
1464     {
1465       min->inside_cost = inside_cost;
1466       min->outside_cost = outside_cost;
1467       min->peel_info.dr = elem->dr;
1468       min->peel_info.npeel = elem->npeel;
1469       min->peel_info.count = elem->count;
1470     }
1471
1472   return 1;
1473 }
1474
1475
1476 /* Choose best peeling option by traversing peeling hash table and either
1477    choosing an option with the lowest cost (if cost model is enabled) or the
1478    option that aligns as many accesses as possible.  */
1479
1480 static struct _vect_peel_extended_info
1481 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1482                                        loop_vec_info loop_vinfo)
1483 {
1484    struct _vect_peel_extended_info res;
1485
1486    res.peel_info.dr = NULL;
1487
1488    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1489      {
1490        res.inside_cost = INT_MAX;
1491        res.outside_cost = INT_MAX;
1492        peeling_htab->traverse <_vect_peel_extended_info *,
1493                                vect_peeling_hash_get_lowest_cost> (&res);
1494      }
1495    else
1496      {
1497        res.peel_info.count = 0;
1498        peeling_htab->traverse <_vect_peel_extended_info *,
1499                                vect_peeling_hash_get_most_frequent> (&res);
1500        res.inside_cost = 0;
1501        res.outside_cost = 0;
1502      }
1503
1504    return res;
1505 }
1506
1507 /* Return true if the new peeling NPEEL is supported.  */
1508
1509 static bool
1510 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1511                           unsigned npeel)
1512 {
1513   unsigned i;
1514   struct data_reference *dr = NULL;
1515   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1516   gimple *stmt;
1517   stmt_vec_info stmt_info;
1518   enum dr_alignment_support supportable_dr_alignment;
1519
1520   /* Ensure that all data refs can be vectorized after the peel.  */
1521   FOR_EACH_VEC_ELT (datarefs, i, dr)
1522     {
1523       int save_misalignment;
1524
1525       if (dr == dr0)
1526         continue;
1527
1528       stmt = DR_STMT (dr);
1529       stmt_info = vinfo_for_stmt (stmt);
1530       /* For interleaving, only the alignment of the first access
1531          matters.  */
1532       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1533           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1534         continue;
1535
1536       /* Strided accesses perform only component accesses, alignment is
1537          irrelevant for them.  */
1538       if (STMT_VINFO_STRIDED_P (stmt_info)
1539           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1540         continue;
1541
1542       save_misalignment = DR_MISALIGNMENT (dr);
1543       vect_update_misalignment_for_peel (dr, dr0, npeel);
1544       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1545       SET_DR_MISALIGNMENT (dr, save_misalignment);
1546
1547       if (!supportable_dr_alignment)
1548         return false;
1549     }
1550
1551   return true;
1552 }
1553
1554 /* Function vect_enhance_data_refs_alignment
1555
1556    This pass will use loop versioning and loop peeling in order to enhance
1557    the alignment of data references in the loop.
1558
1559    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1560    original loop is to be vectorized.  Any other loops that are created by
1561    the transformations performed in this pass - are not supposed to be
1562    vectorized.  This restriction will be relaxed.
1563
1564    This pass will require a cost model to guide it whether to apply peeling
1565    or versioning or a combination of the two.  For example, the scheme that
1566    intel uses when given a loop with several memory accesses, is as follows:
1567    choose one memory access ('p') which alignment you want to force by doing
1568    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1569    other accesses are not necessarily aligned, or (2) use loop versioning to
1570    generate one loop in which all accesses are aligned, and another loop in
1571    which only 'p' is necessarily aligned.
1572
1573    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1574    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1575    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1576
1577    Devising a cost model is the most critical aspect of this work.  It will
1578    guide us on which access to peel for, whether to use loop versioning, how
1579    many versions to create, etc.  The cost model will probably consist of
1580    generic considerations as well as target specific considerations (on
1581    powerpc for example, misaligned stores are more painful than misaligned
1582    loads).
1583
1584    Here are the general steps involved in alignment enhancements:
1585
1586      -- original loop, before alignment analysis:
1587         for (i=0; i<N; i++){
1588           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1589           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1590         }
1591
1592      -- After vect_compute_data_refs_alignment:
1593         for (i=0; i<N; i++){
1594           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1595           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1596         }
1597
1598      -- Possibility 1: we do loop versioning:
1599      if (p is aligned) {
1600         for (i=0; i<N; i++){    # loop 1A
1601           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1602           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1603         }
1604      }
1605      else {
1606         for (i=0; i<N; i++){    # loop 1B
1607           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1608           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1609         }
1610      }
1611
1612      -- Possibility 2: we do loop peeling:
1613      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1614         x = q[i];
1615         p[i] = y;
1616      }
1617      for (i = 3; i < N; i++){   # loop 2A
1618         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1619         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1620      }
1621
1622      -- Possibility 3: combination of loop peeling and versioning:
1623      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1624         x = q[i];
1625         p[i] = y;
1626      }
1627      if (p is aligned) {
1628         for (i = 3; i<N; i++){  # loop 3A
1629           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1630           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1631         }
1632      }
1633      else {
1634         for (i = 3; i<N; i++){  # loop 3B
1635           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1636           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1637         }
1638      }
1639
1640      These loops are later passed to loop_transform to be vectorized.  The
1641      vectorizer will use the alignment information to guide the transformation
1642      (whether to generate regular loads/stores, or with special handling for
1643      misalignment).  */
1644
1645 bool
1646 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1647 {
1648   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1649   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1650   enum dr_alignment_support supportable_dr_alignment;
1651   struct data_reference *dr0 = NULL, *first_store = NULL;
1652   struct data_reference *dr;
1653   unsigned int i, j;
1654   bool do_peeling = false;
1655   bool do_versioning = false;
1656   bool stat;
1657   gimple *stmt;
1658   stmt_vec_info stmt_info;
1659   unsigned int npeel = 0;
1660   bool one_misalignment_known = false;
1661   bool one_misalignment_unknown = false;
1662   bool one_dr_unsupportable = false;
1663   struct data_reference *unsupportable_dr = NULL;
1664   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1665   unsigned possible_npeel_number = 1;
1666   tree vectype;
1667   unsigned int mis, same_align_drs_max = 0;
1668   hash_table<peel_info_hasher> peeling_htab (1);
1669
1670   if (dump_enabled_p ())
1671     dump_printf_loc (MSG_NOTE, vect_location,
1672                      "=== vect_enhance_data_refs_alignment ===\n");
1673
1674   /* Reset data so we can safely be called multiple times.  */
1675   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1676   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1677
1678   /* While cost model enhancements are expected in the future, the high level
1679      view of the code at this time is as follows:
1680
1681      A) If there is a misaligned access then see if peeling to align
1682         this access can make all data references satisfy
1683         vect_supportable_dr_alignment.  If so, update data structures
1684         as needed and return true.
1685
1686      B) If peeling wasn't possible and there is a data reference with an
1687         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1688         then see if loop versioning checks can be used to make all data
1689         references satisfy vect_supportable_dr_alignment.  If so, update
1690         data structures as needed and return true.
1691
1692      C) If neither peeling nor versioning were successful then return false if
1693         any data reference does not satisfy vect_supportable_dr_alignment.
1694
1695      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1696
1697      Note, Possibility 3 above (which is peeling and versioning together) is not
1698      being done at this time.  */
1699
1700   /* (1) Peeling to force alignment.  */
1701
1702   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1703      Considerations:
1704      + How many accesses will become aligned due to the peeling
1705      - How many accesses will become unaligned due to the peeling,
1706        and the cost of misaligned accesses.
1707      - The cost of peeling (the extra runtime checks, the increase
1708        in code size).  */
1709
1710   FOR_EACH_VEC_ELT (datarefs, i, dr)
1711     {
1712       stmt = DR_STMT (dr);
1713       stmt_info = vinfo_for_stmt (stmt);
1714
1715       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1716         continue;
1717
1718       /* For interleaving, only the alignment of the first access
1719          matters.  */
1720       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1721           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1722         continue;
1723
1724       /* For invariant accesses there is nothing to enhance.  */
1725       if (integer_zerop (DR_STEP (dr)))
1726         continue;
1727
1728       /* Strided accesses perform only component accesses, alignment is
1729          irrelevant for them.  */
1730       if (STMT_VINFO_STRIDED_P (stmt_info)
1731           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1732         continue;
1733
1734       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1735       do_peeling = vector_alignment_reachable_p (dr);
1736       if (do_peeling)
1737         {
1738           if (known_alignment_for_access_p (dr))
1739             {
1740               unsigned int npeel_tmp = 0;
1741               bool negative = tree_int_cst_compare (DR_STEP (dr),
1742                                                     size_zero_node) < 0;
1743
1744               vectype = STMT_VINFO_VECTYPE (stmt_info);
1745               unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1746               unsigned int dr_size = vect_get_scalar_dr_size (dr);
1747               mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1748               if (DR_MISALIGNMENT (dr) != 0)
1749                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1750
1751               /* For multiple types, it is possible that the bigger type access
1752                  will have more than one peeling option.  E.g., a loop with two
1753                  types: one of size (vector size / 4), and the other one of
1754                  size (vector size / 8).  Vectorization factor will 8.  If both
1755                  accesses are misaligned by 3, the first one needs one scalar
1756                  iteration to be aligned, and the second one needs 5.  But the
1757                  first one will be aligned also by peeling 5 scalar
1758                  iterations, and in that case both accesses will be aligned.
1759                  Hence, except for the immediate peeling amount, we also want
1760                  to try to add full vector size, while we don't exceed
1761                  vectorization factor.
1762                  We do this automatically for cost model, since we calculate
1763                  cost for every peeling option.  */
1764               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1765                 {
1766                   poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1767                                           ? vf * GROUP_SIZE (stmt_info) : vf);
1768                   possible_npeel_number
1769                     = vect_get_num_vectors (nscalars, vectype);
1770
1771                   /* NPEEL_TMP is 0 when there is no misalignment, but also
1772                      allow peeling NELEMENTS.  */
1773                   if (DR_MISALIGNMENT (dr) == 0)
1774                     possible_npeel_number++;
1775                 }
1776
1777               /* Save info about DR in the hash table.  Also include peeling
1778                  amounts according to the explanation above.  */
1779               for (j = 0; j < possible_npeel_number; j++)
1780                 {
1781                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1782                                             dr, npeel_tmp);
1783                   npeel_tmp += target_align / dr_size;
1784                 }
1785
1786               one_misalignment_known = true;
1787             }
1788           else
1789             {
1790               /* If we don't know any misalignment values, we prefer
1791                  peeling for data-ref that has the maximum number of data-refs
1792                  with the same alignment, unless the target prefers to align
1793                  stores over load.  */
1794               unsigned same_align_drs
1795                 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1796               if (!dr0
1797                   || same_align_drs_max < same_align_drs)
1798                 {
1799                   same_align_drs_max = same_align_drs;
1800                   dr0 = dr;
1801                 }
1802               /* For data-refs with the same number of related
1803                  accesses prefer the one where the misalign
1804                  computation will be invariant in the outermost loop.  */
1805               else if (same_align_drs_max == same_align_drs)
1806                 {
1807                   struct loop *ivloop0, *ivloop;
1808                   ivloop0 = outermost_invariant_loop_for_expr
1809                     (loop, DR_BASE_ADDRESS (dr0));
1810                   ivloop = outermost_invariant_loop_for_expr
1811                     (loop, DR_BASE_ADDRESS (dr));
1812                   if ((ivloop && !ivloop0)
1813                       || (ivloop && ivloop0
1814                           && flow_loop_nested_p (ivloop, ivloop0)))
1815                     dr0 = dr;
1816                 }
1817
1818               one_misalignment_unknown = true;
1819
1820               /* Check for data refs with unsupportable alignment that
1821                  can be peeled.  */
1822               if (!supportable_dr_alignment)
1823               {
1824                 one_dr_unsupportable = true;
1825                 unsupportable_dr = dr;
1826               }
1827
1828               if (!first_store && DR_IS_WRITE (dr))
1829                 first_store = dr;
1830             }
1831         }
1832       else
1833         {
1834           if (!aligned_access_p (dr))
1835             {
1836               if (dump_enabled_p ())
1837                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1838                                  "vector alignment may not be reachable\n");
1839               break;
1840             }
1841         }
1842     }
1843
1844   /* Check if we can possibly peel the loop.  */
1845   if (!vect_can_advance_ivs_p (loop_vinfo)
1846       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1847       || loop->inner)
1848     do_peeling = false;
1849
1850   struct _vect_peel_extended_info peel_for_known_alignment;
1851   struct _vect_peel_extended_info peel_for_unknown_alignment;
1852   struct _vect_peel_extended_info best_peel;
1853
1854   peel_for_unknown_alignment.inside_cost = INT_MAX;
1855   peel_for_unknown_alignment.outside_cost = INT_MAX;
1856   peel_for_unknown_alignment.peel_info.count = 0;
1857
1858   if (do_peeling
1859       && one_misalignment_unknown)
1860     {
1861       /* Check if the target requires to prefer stores over loads, i.e., if
1862          misaligned stores are more expensive than misaligned loads (taking
1863          drs with same alignment into account).  */
1864       unsigned int load_inside_cost = 0;
1865       unsigned int load_outside_cost = 0;
1866       unsigned int store_inside_cost = 0;
1867       unsigned int store_outside_cost = 0;
1868       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1869
1870       stmt_vector_for_cost dummy;
1871       dummy.create (2);
1872       vect_get_peeling_costs_all_drs (datarefs, dr0,
1873                                       &load_inside_cost,
1874                                       &load_outside_cost,
1875                                       &dummy, &dummy, estimated_npeels, true);
1876       dummy.release ();
1877
1878       if (first_store)
1879         {
1880           dummy.create (2);
1881           vect_get_peeling_costs_all_drs (datarefs, first_store,
1882                                           &store_inside_cost,
1883                                           &store_outside_cost,
1884                                           &dummy, &dummy,
1885                                           estimated_npeels, true);
1886           dummy.release ();
1887         }
1888       else
1889         {
1890           store_inside_cost = INT_MAX;
1891           store_outside_cost = INT_MAX;
1892         }
1893
1894       if (load_inside_cost > store_inside_cost
1895           || (load_inside_cost == store_inside_cost
1896               && load_outside_cost > store_outside_cost))
1897         {
1898           dr0 = first_store;
1899           peel_for_unknown_alignment.inside_cost = store_inside_cost;
1900           peel_for_unknown_alignment.outside_cost = store_outside_cost;
1901         }
1902       else
1903         {
1904           peel_for_unknown_alignment.inside_cost = load_inside_cost;
1905           peel_for_unknown_alignment.outside_cost = load_outside_cost;
1906         }
1907
1908       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1909       prologue_cost_vec.create (2);
1910       epilogue_cost_vec.create (2);
1911
1912       int dummy2;
1913       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1914         (loop_vinfo, estimated_npeels, &dummy2,
1915          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1916          &prologue_cost_vec, &epilogue_cost_vec);
1917
1918       prologue_cost_vec.release ();
1919       epilogue_cost_vec.release ();
1920
1921       peel_for_unknown_alignment.peel_info.count = 1
1922         + STMT_VINFO_SAME_ALIGN_REFS
1923         (vinfo_for_stmt (DR_STMT (dr0))).length ();
1924     }
1925
1926   peel_for_unknown_alignment.peel_info.npeel = 0;
1927   peel_for_unknown_alignment.peel_info.dr = dr0;
1928
1929   best_peel = peel_for_unknown_alignment;
1930
1931   peel_for_known_alignment.inside_cost = INT_MAX;
1932   peel_for_known_alignment.outside_cost = INT_MAX;
1933   peel_for_known_alignment.peel_info.count = 0;
1934   peel_for_known_alignment.peel_info.dr = NULL;
1935
1936   if (do_peeling && one_misalignment_known)
1937     {
1938       /* Peeling is possible, but there is no data access that is not supported
1939          unless aligned.  So we try to choose the best possible peeling from
1940          the hash table.  */
1941       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1942         (&peeling_htab, loop_vinfo);
1943     }
1944
1945   /* Compare costs of peeling for known and unknown alignment. */
1946   if (peel_for_known_alignment.peel_info.dr != NULL
1947       && peel_for_unknown_alignment.inside_cost
1948       >= peel_for_known_alignment.inside_cost)
1949     {
1950       best_peel = peel_for_known_alignment;
1951
1952       /* If the best peeling for known alignment has NPEEL == 0, perform no
1953          peeling at all except if there is an unsupportable dr that we can
1954          align.  */
1955       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1956         do_peeling = false;
1957     }
1958
1959   /* If there is an unsupportable data ref, prefer this over all choices so far
1960      since we'd have to discard a chosen peeling except when it accidentally
1961      aligned the unsupportable data ref.  */
1962   if (one_dr_unsupportable)
1963     dr0 = unsupportable_dr;
1964   else if (do_peeling)
1965     {
1966       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1967          TODO: Use nopeel_outside_cost or get rid of it?  */
1968       unsigned nopeel_inside_cost = 0;
1969       unsigned nopeel_outside_cost = 0;
1970
1971       stmt_vector_for_cost dummy;
1972       dummy.create (2);
1973       vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1974                                       &nopeel_outside_cost, &dummy, &dummy,
1975                                       0, false);
1976       dummy.release ();
1977
1978       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
1979          costs will be recorded.  */
1980       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1981       prologue_cost_vec.create (2);
1982       epilogue_cost_vec.create (2);
1983
1984       int dummy2;
1985       nopeel_outside_cost += vect_get_known_peeling_cost
1986         (loop_vinfo, 0, &dummy2,
1987          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1988          &prologue_cost_vec, &epilogue_cost_vec);
1989
1990       prologue_cost_vec.release ();
1991       epilogue_cost_vec.release ();
1992
1993       npeel = best_peel.peel_info.npeel;
1994       dr0 = best_peel.peel_info.dr;
1995
1996       /* If no peeling is not more expensive than the best peeling we
1997          have so far, don't perform any peeling.  */
1998       if (nopeel_inside_cost <= best_peel.inside_cost)
1999         do_peeling = false;
2000     }
2001
2002   if (do_peeling)
2003     {
2004       stmt = DR_STMT (dr0);
2005       stmt_info = vinfo_for_stmt (stmt);
2006       vectype = STMT_VINFO_VECTYPE (stmt_info);
2007
2008       if (known_alignment_for_access_p (dr0))
2009         {
2010           bool negative = tree_int_cst_compare (DR_STEP (dr0),
2011                                                 size_zero_node) < 0;
2012           if (!npeel)
2013             {
2014               /* Since it's known at compile time, compute the number of
2015                  iterations in the peeled loop (the peeling factor) for use in
2016                  updating DR_MISALIGNMENT values.  The peeling factor is the
2017                  vectorization factor minus the misalignment as an element
2018                  count.  */
2019               mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2020               unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2021               npeel = ((mis & (target_align - 1))
2022                        / vect_get_scalar_dr_size (dr0));
2023             }
2024
2025           /* For interleaved data access every iteration accesses all the
2026              members of the group, therefore we divide the number of iterations
2027              by the group size.  */
2028           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2029           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2030             npeel /= GROUP_SIZE (stmt_info);
2031
2032           if (dump_enabled_p ())
2033             dump_printf_loc (MSG_NOTE, vect_location,
2034                              "Try peeling by %d\n", npeel);
2035         }
2036
2037       /* Ensure that all datarefs can be vectorized after the peel.  */
2038       if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2039         do_peeling = false;
2040
2041       /* Check if all datarefs are supportable and log.  */
2042       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2043         {
2044           stat = vect_verify_datarefs_alignment (loop_vinfo);
2045           if (!stat)
2046             do_peeling = false;
2047           else
2048             return stat;
2049         }
2050
2051       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2052       if (do_peeling)
2053         {
2054           unsigned max_allowed_peel
2055             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2056           if (max_allowed_peel != (unsigned)-1)
2057             {
2058               unsigned max_peel = npeel;
2059               if (max_peel == 0)
2060                 {
2061                   unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2062                   max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2063                 }
2064               if (max_peel > max_allowed_peel)
2065                 {
2066                   do_peeling = false;
2067                   if (dump_enabled_p ())
2068                     dump_printf_loc (MSG_NOTE, vect_location,
2069                         "Disable peeling, max peels reached: %d\n", max_peel);
2070                 }
2071             }
2072         }
2073
2074       /* Cost model #2 - if peeling may result in a remaining loop not
2075          iterating enough to be vectorized then do not peel.  Since this
2076          is a cost heuristic rather than a correctness decision, use the
2077          most likely runtime value for variable vectorization factors.  */
2078       if (do_peeling
2079           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2080         {
2081           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2082           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2083           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2084               < assumed_vf + max_peel)
2085             do_peeling = false;
2086         }
2087
2088       if (do_peeling)
2089         {
2090           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2091              If the misalignment of DR_i is identical to that of dr0 then set
2092              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2093              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2094              by the peeling factor times the element size of DR_i (MOD the
2095              vectorization factor times the size).  Otherwise, the
2096              misalignment of DR_i must be set to unknown.  */
2097           FOR_EACH_VEC_ELT (datarefs, i, dr)
2098             if (dr != dr0)
2099               {
2100                 /* Strided accesses perform only component accesses, alignment
2101                    is irrelevant for them.  */
2102                 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2103                 if (STMT_VINFO_STRIDED_P (stmt_info)
2104                     && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2105                   continue;
2106
2107                 vect_update_misalignment_for_peel (dr, dr0, npeel);
2108               }
2109
2110           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2111           if (npeel)
2112             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2113           else
2114             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2115               = DR_MISALIGNMENT (dr0);
2116           SET_DR_MISALIGNMENT (dr0, 0);
2117           if (dump_enabled_p ())
2118             {
2119               dump_printf_loc (MSG_NOTE, vect_location,
2120                                "Alignment of access forced using peeling.\n");
2121               dump_printf_loc (MSG_NOTE, vect_location,
2122                                "Peeling for alignment will be applied.\n");
2123             }
2124
2125           /* The inside-loop cost will be accounted for in vectorizable_load
2126              and vectorizable_store correctly with adjusted alignments.
2127              Drop the body_cst_vec on the floor here.  */
2128           stat = vect_verify_datarefs_alignment (loop_vinfo);
2129           gcc_assert (stat);
2130           return stat;
2131         }
2132     }
2133
2134   /* (2) Versioning to force alignment.  */
2135
2136   /* Try versioning if:
2137      1) optimize loop for speed
2138      2) there is at least one unsupported misaligned data ref with an unknown
2139         misalignment, and
2140      3) all misaligned data refs with a known misalignment are supported, and
2141      4) the number of runtime alignment checks is within reason.  */
2142
2143   do_versioning =
2144         optimize_loop_nest_for_speed_p (loop)
2145         && (!loop->inner); /* FORNOW */
2146
2147   if (do_versioning)
2148     {
2149       FOR_EACH_VEC_ELT (datarefs, i, dr)
2150         {
2151           stmt = DR_STMT (dr);
2152           stmt_info = vinfo_for_stmt (stmt);
2153
2154           /* For interleaving, only the alignment of the first access
2155              matters.  */
2156           if (aligned_access_p (dr)
2157               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2158                   && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2159             continue;
2160
2161           if (STMT_VINFO_STRIDED_P (stmt_info))
2162             {
2163               /* Strided loads perform only component accesses, alignment is
2164                  irrelevant for them.  */
2165               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2166                 continue;
2167               do_versioning = false;
2168               break;
2169             }
2170
2171           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2172
2173           if (!supportable_dr_alignment)
2174             {
2175               gimple *stmt;
2176               int mask;
2177               tree vectype;
2178
2179               if (known_alignment_for_access_p (dr)
2180                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2181                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2182                 {
2183                   do_versioning = false;
2184                   break;
2185                 }
2186
2187               stmt = DR_STMT (dr);
2188               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2189               gcc_assert (vectype);
2190
2191               /* At present we don't support versioning for alignment
2192                  with variable VF, since there's no guarantee that the
2193                  VF is a power of two.  We could relax this if we added
2194                  a way of enforcing a power-of-two size.  */
2195               unsigned HOST_WIDE_INT size;
2196               if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2197                 {
2198                   do_versioning = false;
2199                   break;
2200                 }
2201
2202               /* The rightmost bits of an aligned address must be zeros.
2203                  Construct the mask needed for this test.  For example,
2204                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2205                  mask must be 15 = 0xf. */
2206               mask = size - 1;
2207
2208               /* FORNOW: use the same mask to test all potentially unaligned
2209                  references in the loop.  The vectorizer currently supports
2210                  a single vector size, see the reference to
2211                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2212                  vectorization factor is computed.  */
2213               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2214                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2215               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2216               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2217                       DR_STMT (dr));
2218             }
2219         }
2220
2221       /* Versioning requires at least one misaligned data reference.  */
2222       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2223         do_versioning = false;
2224       else if (!do_versioning)
2225         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2226     }
2227
2228   if (do_versioning)
2229     {
2230       vec<gimple *> may_misalign_stmts
2231         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2232       gimple *stmt;
2233
2234       /* It can now be assumed that the data references in the statements
2235          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2236          of the loop being vectorized.  */
2237       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2238         {
2239           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2240           dr = STMT_VINFO_DATA_REF (stmt_info);
2241           SET_DR_MISALIGNMENT (dr, 0);
2242           if (dump_enabled_p ())
2243             dump_printf_loc (MSG_NOTE, vect_location,
2244                              "Alignment of access forced using versioning.\n");
2245         }
2246
2247       if (dump_enabled_p ())
2248         dump_printf_loc (MSG_NOTE, vect_location,
2249                          "Versioning for alignment will be applied.\n");
2250
2251       /* Peeling and versioning can't be done together at this time.  */
2252       gcc_assert (! (do_peeling && do_versioning));
2253
2254       stat = vect_verify_datarefs_alignment (loop_vinfo);
2255       gcc_assert (stat);
2256       return stat;
2257     }
2258
2259   /* This point is reached if neither peeling nor versioning is being done.  */
2260   gcc_assert (! (do_peeling || do_versioning));
2261
2262   stat = vect_verify_datarefs_alignment (loop_vinfo);
2263   return stat;
2264 }
2265
2266
2267 /* Function vect_find_same_alignment_drs.
2268
2269    Update group and alignment relations according to the chosen
2270    vectorization factor.  */
2271
2272 static void
2273 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2274 {
2275   struct data_reference *dra = DDR_A (ddr);
2276   struct data_reference *drb = DDR_B (ddr);
2277   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2278   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2279
2280   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2281     return;
2282
2283   if (dra == drb)
2284     return;
2285
2286   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2287       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2288       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2289     return;
2290
2291   /* Two references with distance zero have the same alignment.  */
2292   poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2293                           - wi::to_poly_offset (DR_INIT (drb)));
2294   if (maybe_ne (diff, 0))
2295     {
2296       /* Get the wider of the two alignments.  */
2297       unsigned int align_a = (vect_calculate_target_alignment (dra)
2298                               / BITS_PER_UNIT);
2299       unsigned int align_b = (vect_calculate_target_alignment (drb)
2300                               / BITS_PER_UNIT);
2301       unsigned int max_align = MAX (align_a, align_b);
2302
2303       /* Require the gap to be a multiple of the larger vector alignment.  */
2304       if (!multiple_p (diff, max_align))
2305         return;
2306     }
2307
2308   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2309   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2310   if (dump_enabled_p ())
2311     {
2312       dump_printf_loc (MSG_NOTE, vect_location,
2313                        "accesses have the same alignment: ");
2314       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2315       dump_printf (MSG_NOTE,  " and ");
2316       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2317       dump_printf (MSG_NOTE, "\n");
2318     }
2319 }
2320
2321
2322 /* Function vect_analyze_data_refs_alignment
2323
2324    Analyze the alignment of the data-references in the loop.
2325    Return FALSE if a data reference is found that cannot be vectorized.  */
2326
2327 bool
2328 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2329 {
2330   if (dump_enabled_p ())
2331     dump_printf_loc (MSG_NOTE, vect_location,
2332                      "=== vect_analyze_data_refs_alignment ===\n");
2333
2334   /* Mark groups of data references with same alignment using
2335      data dependence information.  */
2336   vec<ddr_p> ddrs = vinfo->ddrs;
2337   struct data_dependence_relation *ddr;
2338   unsigned int i;
2339
2340   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2341     vect_find_same_alignment_drs (ddr);
2342
2343   vec<data_reference_p> datarefs = vinfo->datarefs;
2344   struct data_reference *dr;
2345
2346   vect_record_base_alignments (vinfo);
2347   FOR_EACH_VEC_ELT (datarefs, i, dr)
2348     {
2349       stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2350       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2351           && !vect_compute_data_ref_alignment (dr))
2352         {
2353           /* Strided accesses perform only component accesses, misalignment
2354              information is irrelevant for them.  */
2355           if (STMT_VINFO_STRIDED_P (stmt_info)
2356               && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2357             continue;
2358
2359           if (dump_enabled_p ())
2360             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2361                              "not vectorized: can't calculate alignment "
2362                              "for data ref.\n");
2363
2364           return false;
2365         }
2366     }
2367
2368   return true;
2369 }
2370
2371
2372 /* Analyze alignment of DRs of stmts in NODE.  */
2373
2374 static bool
2375 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2376 {
2377   /* We vectorize from the first scalar stmt in the node unless
2378      the node is permuted in which case we start from the first
2379      element in the group.  */
2380   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2381   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2382   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2383     first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2384
2385   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2386   if (! vect_compute_data_ref_alignment (dr)
2387       /* For creating the data-ref pointer we need alignment of the
2388          first element anyway.  */
2389       || (dr != first_dr
2390           && ! vect_compute_data_ref_alignment (first_dr))
2391       || ! verify_data_ref_alignment (dr))
2392     {
2393       if (dump_enabled_p ())
2394         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2395                          "not vectorized: bad data alignment in basic "
2396                          "block.\n");
2397       return false;
2398     }
2399
2400   return true;
2401 }
2402
2403 /* Function vect_slp_analyze_instance_alignment
2404
2405    Analyze the alignment of the data-references in the SLP instance.
2406    Return FALSE if a data reference is found that cannot be vectorized.  */
2407
2408 bool
2409 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2410 {
2411   if (dump_enabled_p ())
2412     dump_printf_loc (MSG_NOTE, vect_location,
2413                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2414
2415   slp_tree node;
2416   unsigned i;
2417   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2418     if (! vect_slp_analyze_and_verify_node_alignment (node))
2419       return false;
2420
2421   node = SLP_INSTANCE_TREE (instance);
2422   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2423       && ! vect_slp_analyze_and_verify_node_alignment
2424              (SLP_INSTANCE_TREE (instance)))
2425     return false;
2426
2427   return true;
2428 }
2429
2430
2431 /* Analyze groups of accesses: check that DR belongs to a group of
2432    accesses of legal size, step, etc.  Detect gaps, single element
2433    interleaving, and other special cases. Set grouped access info.
2434    Collect groups of strided stores for further use in SLP analysis.
2435    Worker for vect_analyze_group_access.  */
2436
2437 static bool
2438 vect_analyze_group_access_1 (struct data_reference *dr)
2439 {
2440   tree step = DR_STEP (dr);
2441   tree scalar_type = TREE_TYPE (DR_REF (dr));
2442   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2443   gimple *stmt = DR_STMT (dr);
2444   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2445   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2446   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2447   HOST_WIDE_INT dr_step = -1;
2448   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2449   bool slp_impossible = false;
2450
2451   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2452      size of the interleaving group (including gaps).  */
2453   if (tree_fits_shwi_p (step))
2454     {
2455       dr_step = tree_to_shwi (step);
2456       /* Check that STEP is a multiple of type size.  Otherwise there is
2457          a non-element-sized gap at the end of the group which we
2458          cannot represent in GROUP_GAP or GROUP_SIZE.
2459          ???  As we can handle non-constant step fine here we should
2460          simply remove uses of GROUP_GAP between the last and first
2461          element and instead rely on DR_STEP.  GROUP_SIZE then would
2462          simply not include that gap.  */
2463       if ((dr_step % type_size) != 0)
2464         {
2465           if (dump_enabled_p ())
2466             {
2467               dump_printf_loc (MSG_NOTE, vect_location,
2468                                "Step ");
2469               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2470               dump_printf (MSG_NOTE,
2471                            " is not a multiple of the element size for ");
2472               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2473               dump_printf (MSG_NOTE, "\n");
2474             }
2475           return false;
2476         }
2477       groupsize = absu_hwi (dr_step) / type_size;
2478     }
2479   else
2480     groupsize = 0;
2481
2482   /* Not consecutive access is possible only if it is a part of interleaving.  */
2483   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2484     {
2485       /* Check if it this DR is a part of interleaving, and is a single
2486          element of the group that is accessed in the loop.  */
2487
2488       /* Gaps are supported only for loads. STEP must be a multiple of the type
2489          size.  */
2490       if (DR_IS_READ (dr)
2491           && (dr_step % type_size) == 0
2492           && groupsize > 0)
2493         {
2494           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2495           GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2496           GROUP_GAP (stmt_info) = groupsize - 1;
2497           if (dump_enabled_p ())
2498             {
2499               dump_printf_loc (MSG_NOTE, vect_location,
2500                                "Detected single element interleaving ");
2501               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2502               dump_printf (MSG_NOTE, " step ");
2503               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2504               dump_printf (MSG_NOTE, "\n");
2505             }
2506
2507           return true;
2508         }
2509
2510       if (dump_enabled_p ())
2511         {
2512           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2513                            "not consecutive access ");
2514           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2515         }
2516
2517       if (bb_vinfo)
2518         {
2519           /* Mark the statement as unvectorizable.  */
2520           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2521           return true;
2522         }
2523
2524       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2525       STMT_VINFO_STRIDED_P (stmt_info) = true;
2526       return true;
2527     }
2528
2529   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2530     {
2531       /* First stmt in the interleaving chain. Check the chain.  */
2532       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2533       struct data_reference *data_ref = dr;
2534       unsigned int count = 1;
2535       tree prev_init = DR_INIT (data_ref);
2536       gimple *prev = stmt;
2537       HOST_WIDE_INT diff, gaps = 0;
2538
2539       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2540       while (next)
2541         {
2542           /* Skip same data-refs.  In case that two or more stmts share
2543              data-ref (supported only for loads), we vectorize only the first
2544              stmt, and the rest get their vectorized loads from the first
2545              one.  */
2546           if (!tree_int_cst_compare (DR_INIT (data_ref),
2547                                      DR_INIT (STMT_VINFO_DATA_REF (
2548                                                    vinfo_for_stmt (next)))))
2549             {
2550               if (DR_IS_WRITE (data_ref))
2551                 {
2552                   if (dump_enabled_p ())
2553                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2554                                      "Two store stmts share the same dr.\n");
2555                   return false;
2556                 }
2557
2558               if (dump_enabled_p ())
2559                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2560                                  "Two or more load stmts share the same dr.\n");
2561
2562               /* For load use the same data-ref load.  */
2563               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2564
2565               prev = next;
2566               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2567               continue;
2568             }
2569
2570           prev = next;
2571           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2572
2573           /* All group members have the same STEP by construction.  */
2574           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2575
2576           /* Check that the distance between two accesses is equal to the type
2577              size. Otherwise, we have gaps.  */
2578           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2579                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2580           if (diff != 1)
2581             {
2582               /* FORNOW: SLP of accesses with gaps is not supported.  */
2583               slp_impossible = true;
2584               if (DR_IS_WRITE (data_ref))
2585                 {
2586                   if (dump_enabled_p ())
2587                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2588                                      "interleaved store with gaps\n");
2589                   return false;
2590                 }
2591
2592               gaps += diff - 1;
2593             }
2594
2595           last_accessed_element += diff;
2596
2597           /* Store the gap from the previous member of the group. If there is no
2598              gap in the access, GROUP_GAP is always 1.  */
2599           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2600
2601           prev_init = DR_INIT (data_ref);
2602           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2603           /* Count the number of data-refs in the chain.  */
2604           count++;
2605         }
2606
2607       if (groupsize == 0)
2608         groupsize = count + gaps;
2609
2610       /* This could be UINT_MAX but as we are generating code in a very
2611          inefficient way we have to cap earlier.  See PR78699 for example.  */
2612       if (groupsize > 4096)
2613         {
2614           if (dump_enabled_p ())
2615             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2616                              "group is too large\n");
2617           return false;
2618         }
2619
2620       /* Check that the size of the interleaving is equal to count for stores,
2621          i.e., that there are no gaps.  */
2622       if (groupsize != count
2623           && !DR_IS_READ (dr))
2624         {
2625           if (dump_enabled_p ())
2626             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2627                              "interleaved store with gaps\n");
2628           return false;
2629         }
2630
2631       /* If there is a gap after the last load in the group it is the
2632          difference between the groupsize and the last accessed
2633          element.
2634          When there is no gap, this difference should be 0.  */
2635       GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2636
2637       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2638       if (dump_enabled_p ())
2639         {
2640           dump_printf_loc (MSG_NOTE, vect_location,
2641                            "Detected interleaving ");
2642           if (DR_IS_READ (dr))
2643             dump_printf (MSG_NOTE, "load ");
2644           else
2645             dump_printf (MSG_NOTE, "store ");
2646           dump_printf (MSG_NOTE, "of size %u starting with ",
2647                        (unsigned)groupsize);
2648           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2649           if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2650             dump_printf_loc (MSG_NOTE, vect_location,
2651                              "There is a gap of %u elements after the group\n",
2652                              GROUP_GAP (vinfo_for_stmt (stmt)));
2653         }
2654
2655       /* SLP: create an SLP data structure for every interleaving group of
2656          stores for further analysis in vect_analyse_slp.  */
2657       if (DR_IS_WRITE (dr) && !slp_impossible)
2658         {
2659           if (loop_vinfo)
2660             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2661           if (bb_vinfo)
2662             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2663         }
2664     }
2665
2666   return true;
2667 }
2668
2669 /* Analyze groups of accesses: check that DR belongs to a group of
2670    accesses of legal size, step, etc.  Detect gaps, single element
2671    interleaving, and other special cases. Set grouped access info.
2672    Collect groups of strided stores for further use in SLP analysis.  */
2673
2674 static bool
2675 vect_analyze_group_access (struct data_reference *dr)
2676 {
2677   if (!vect_analyze_group_access_1 (dr))
2678     {
2679       /* Dissolve the group if present.  */
2680       gimple *next;
2681       gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2682       while (stmt)
2683         {
2684           stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2685           next = GROUP_NEXT_ELEMENT (vinfo);
2686           GROUP_FIRST_ELEMENT (vinfo) = NULL;
2687           GROUP_NEXT_ELEMENT (vinfo) = NULL;
2688           stmt = next;
2689         }
2690       return false;
2691     }
2692   return true;
2693 }
2694
2695 /* Analyze the access pattern of the data-reference DR.
2696    In case of non-consecutive accesses call vect_analyze_group_access() to
2697    analyze groups of accesses.  */
2698
2699 static bool
2700 vect_analyze_data_ref_access (struct data_reference *dr)
2701 {
2702   tree step = DR_STEP (dr);
2703   tree scalar_type = TREE_TYPE (DR_REF (dr));
2704   gimple *stmt = DR_STMT (dr);
2705   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2706   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2707   struct loop *loop = NULL;
2708
2709   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2710     return true;
2711
2712   if (loop_vinfo)
2713     loop = LOOP_VINFO_LOOP (loop_vinfo);
2714
2715   if (loop_vinfo && !step)
2716     {
2717       if (dump_enabled_p ())
2718         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2719                          "bad data-ref access in loop\n");
2720       return false;
2721     }
2722
2723   /* Allow loads with zero step in inner-loop vectorization.  */
2724   if (loop_vinfo && integer_zerop (step))
2725     {
2726       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2727       if (!nested_in_vect_loop_p (loop, stmt))
2728         return DR_IS_READ (dr);
2729       /* Allow references with zero step for outer loops marked
2730          with pragma omp simd only - it guarantees absence of
2731          loop-carried dependencies between inner loop iterations.  */
2732       if (loop->safelen < 2)
2733         {
2734           if (dump_enabled_p ())
2735             dump_printf_loc (MSG_NOTE, vect_location,
2736                              "zero step in inner loop of nest\n");
2737           return false;
2738         }
2739     }
2740
2741   if (loop && nested_in_vect_loop_p (loop, stmt))
2742     {
2743       /* Interleaved accesses are not yet supported within outer-loop
2744         vectorization for references in the inner-loop.  */
2745       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2746
2747       /* For the rest of the analysis we use the outer-loop step.  */
2748       step = STMT_VINFO_DR_STEP (stmt_info);
2749       if (integer_zerop (step))
2750         {
2751           if (dump_enabled_p ())
2752             dump_printf_loc (MSG_NOTE, vect_location,
2753                              "zero step in outer loop.\n");
2754           return DR_IS_READ (dr);
2755         }
2756     }
2757
2758   /* Consecutive?  */
2759   if (TREE_CODE (step) == INTEGER_CST)
2760     {
2761       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2762       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2763           || (dr_step < 0
2764               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2765         {
2766           /* Mark that it is not interleaving.  */
2767           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2768           return true;
2769         }
2770     }
2771
2772   if (loop && nested_in_vect_loop_p (loop, stmt))
2773     {
2774       if (dump_enabled_p ())
2775         dump_printf_loc (MSG_NOTE, vect_location,
2776                          "grouped access in outer loop.\n");
2777       return false;
2778     }
2779
2780
2781   /* Assume this is a DR handled by non-constant strided load case.  */
2782   if (TREE_CODE (step) != INTEGER_CST)
2783     return (STMT_VINFO_STRIDED_P (stmt_info)
2784             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2785                 || vect_analyze_group_access (dr)));
2786
2787   /* Not consecutive access - check if it's a part of interleaving group.  */
2788   return vect_analyze_group_access (dr);
2789 }
2790
2791 /* Compare two data-references DRA and DRB to group them into chunks
2792    suitable for grouping.  */
2793
2794 static int
2795 dr_group_sort_cmp (const void *dra_, const void *drb_)
2796 {
2797   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2798   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2799   int cmp;
2800
2801   /* Stabilize sort.  */
2802   if (dra == drb)
2803     return 0;
2804
2805   /* DRs in different loops never belong to the same group.  */
2806   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2807   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2808   if (loopa != loopb)
2809     return loopa->num < loopb->num ? -1 : 1;
2810
2811   /* Ordering of DRs according to base.  */
2812   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2813                                DR_BASE_ADDRESS (drb));
2814   if (cmp != 0)
2815     return cmp;
2816
2817   /* And according to DR_OFFSET.  */
2818   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2819   if (cmp != 0)
2820     return cmp;
2821
2822   /* Put reads before writes.  */
2823   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2824     return DR_IS_READ (dra) ? -1 : 1;
2825
2826   /* Then sort after access size.  */
2827   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2828                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2829   if (cmp != 0)
2830     return cmp;
2831
2832   /* And after step.  */
2833   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2834   if (cmp != 0)
2835     return cmp;
2836
2837   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2838   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2839   if (cmp == 0)
2840     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2841   return cmp;
2842 }
2843
2844 /* If OP is the result of a conversion, return the unconverted value,
2845    otherwise return null.  */
2846
2847 static tree
2848 strip_conversion (tree op)
2849 {
2850   if (TREE_CODE (op) != SSA_NAME)
2851     return NULL_TREE;
2852   gimple *stmt = SSA_NAME_DEF_STMT (op);
2853   if (!is_gimple_assign (stmt)
2854       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2855     return NULL_TREE;
2856   return gimple_assign_rhs1 (stmt);
2857 }
2858
2859 /* Return true if vectorizable_* routines can handle statements STMT1
2860    and STMT2 being in a single group.  */
2861
2862 static bool
2863 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2864 {
2865   if (gimple_assign_single_p (stmt1))
2866     return gimple_assign_single_p (stmt2);
2867
2868   if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2869     {
2870       /* Check for two masked loads or two masked stores.  */
2871       if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2872         return false;
2873       internal_fn ifn = gimple_call_internal_fn (stmt1);
2874       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2875         return false;
2876       if (ifn != gimple_call_internal_fn (stmt2))
2877         return false;
2878
2879       /* Check that the masks are the same.  Cope with casts of masks,
2880          like those created by build_mask_conversion.  */
2881       tree mask1 = gimple_call_arg (stmt1, 2);
2882       tree mask2 = gimple_call_arg (stmt2, 2);
2883       if (!operand_equal_p (mask1, mask2, 0))
2884         {
2885           mask1 = strip_conversion (mask1);
2886           if (!mask1)
2887             return false;
2888           mask2 = strip_conversion (mask2);
2889           if (!mask2)
2890             return false;
2891           if (!operand_equal_p (mask1, mask2, 0))
2892             return false;
2893         }
2894       return true;
2895     }
2896
2897   return false;
2898 }
2899
2900 /* Function vect_analyze_data_ref_accesses.
2901
2902    Analyze the access pattern of all the data references in the loop.
2903
2904    FORNOW: the only access pattern that is considered vectorizable is a
2905            simple step 1 (consecutive) access.
2906
2907    FORNOW: handle only arrays and pointer accesses.  */
2908
2909 bool
2910 vect_analyze_data_ref_accesses (vec_info *vinfo)
2911 {
2912   unsigned int i;
2913   vec<data_reference_p> datarefs = vinfo->datarefs;
2914   struct data_reference *dr;
2915
2916   if (dump_enabled_p ())
2917     dump_printf_loc (MSG_NOTE, vect_location,
2918                      "=== vect_analyze_data_ref_accesses ===\n");
2919
2920   if (datarefs.is_empty ())
2921     return true;
2922
2923   /* Sort the array of datarefs to make building the interleaving chains
2924      linear.  Don't modify the original vector's order, it is needed for
2925      determining what dependencies are reversed.  */
2926   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2927   datarefs_copy.qsort (dr_group_sort_cmp);
2928
2929   /* Build the interleaving chains.  */
2930   for (i = 0; i < datarefs_copy.length () - 1;)
2931     {
2932       data_reference_p dra = datarefs_copy[i];
2933       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2934       stmt_vec_info lastinfo = NULL;
2935       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2936           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2937         {
2938           ++i;
2939           continue;
2940         }
2941       for (i = i + 1; i < datarefs_copy.length (); ++i)
2942         {
2943           data_reference_p drb = datarefs_copy[i];
2944           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2945           if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2946               || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2947             break;
2948
2949           /* ???  Imperfect sorting (non-compatible types, non-modulo
2950              accesses, same accesses) can lead to a group to be artificially
2951              split here as we don't just skip over those.  If it really
2952              matters we can push those to a worklist and re-iterate
2953              over them.  The we can just skip ahead to the next DR here.  */
2954
2955           /* DRs in a different loop should not be put into the same
2956              interleaving group.  */
2957           if (gimple_bb (DR_STMT (dra))->loop_father
2958               != gimple_bb (DR_STMT (drb))->loop_father)
2959             break;
2960
2961           /* Check that the data-refs have same first location (except init)
2962              and they are both either store or load (not load and store,
2963              not masked loads or stores).  */
2964           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2965               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2966                                         DR_BASE_ADDRESS (drb)) != 0
2967               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2968               || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
2969             break;
2970
2971           /* Check that the data-refs have the same constant size.  */
2972           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2973           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2974           if (!tree_fits_uhwi_p (sza)
2975               || !tree_fits_uhwi_p (szb)
2976               || !tree_int_cst_equal (sza, szb))
2977             break;
2978
2979           /* Check that the data-refs have the same step.  */
2980           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2981             break;
2982
2983           /* Check the types are compatible.
2984              ???  We don't distinguish this during sorting.  */
2985           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2986                                    TREE_TYPE (DR_REF (drb))))
2987             break;
2988
2989           /* Check that the DR_INITs are compile-time constants.  */
2990           if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2991               || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2992             break;
2993
2994           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2995           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2996           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2997           HOST_WIDE_INT init_prev
2998             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2999           gcc_assert (init_a <= init_b
3000                       && init_a <= init_prev
3001                       && init_prev <= init_b);
3002
3003           /* Do not place the same access in the interleaving chain twice.  */
3004           if (init_b == init_prev)
3005             {
3006               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3007                           < gimple_uid (DR_STMT (drb)));
3008               /* ???  For now we simply "drop" the later reference which is
3009                  otherwise the same rather than finishing off this group.
3010                  In the end we'd want to re-process duplicates forming
3011                  multiple groups from the refs, likely by just collecting
3012                  all candidates (including duplicates and split points
3013                  below) in a vector and then process them together.  */
3014               continue;
3015             }
3016
3017           /* If init_b == init_a + the size of the type * k, we have an
3018              interleaving, and DRA is accessed before DRB.  */
3019           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3020           if (type_size_a == 0
3021               || (init_b - init_a) % type_size_a != 0)
3022             break;
3023
3024           /* If we have a store, the accesses are adjacent.  This splits
3025              groups into chunks we support (we don't support vectorization
3026              of stores with gaps).  */
3027           if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3028             break;
3029
3030           /* If the step (if not zero or non-constant) is greater than the
3031              difference between data-refs' inits this splits groups into
3032              suitable sizes.  */
3033           if (tree_fits_shwi_p (DR_STEP (dra)))
3034             {
3035               HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3036               if (step != 0 && step <= (init_b - init_a))
3037                 break;
3038             }
3039
3040           if (dump_enabled_p ())
3041             {
3042               dump_printf_loc (MSG_NOTE, vect_location,
3043                                "Detected interleaving ");
3044               if (DR_IS_READ (dra))
3045                 dump_printf (MSG_NOTE, "load ");
3046               else
3047                 dump_printf (MSG_NOTE, "store ");
3048               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3049               dump_printf (MSG_NOTE,  " and ");
3050               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3051               dump_printf (MSG_NOTE, "\n");
3052             }
3053
3054           /* Link the found element into the group list.  */
3055           if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
3056             {
3057               GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
3058               lastinfo = stmtinfo_a;
3059             }
3060           GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
3061           GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
3062           lastinfo = stmtinfo_b;
3063         }
3064     }
3065
3066   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3067     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
3068         && !vect_analyze_data_ref_access (dr))
3069       {
3070         if (dump_enabled_p ())
3071           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3072                            "not vectorized: complicated access pattern.\n");
3073
3074         if (is_a <bb_vec_info> (vinfo))
3075           {
3076             /* Mark the statement as not vectorizable.  */
3077             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3078             continue;
3079           }
3080         else
3081           {
3082             datarefs_copy.release ();
3083             return false;
3084           }
3085       }
3086
3087   datarefs_copy.release ();
3088   return true;
3089 }
3090
3091 /* Function vect_vfa_segment_size.
3092
3093    Input:
3094      DR: The data reference.
3095      LENGTH_FACTOR: segment length to consider.
3096
3097    Return a value suitable for the dr_with_seg_len::seg_len field.
3098    This is the "distance travelled" by the pointer from the first
3099    iteration in the segment to the last.  Note that it does not include
3100    the size of the access; in effect it only describes the first byte.  */
3101
3102 static tree
3103 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3104 {
3105   length_factor = size_binop (MINUS_EXPR,
3106                               fold_convert (sizetype, length_factor),
3107                               size_one_node);
3108   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3109                      length_factor);
3110 }
3111
3112 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3113    gives the worst-case number of bytes covered by the segment.  */
3114
3115 static unsigned HOST_WIDE_INT
3116 vect_vfa_access_size (data_reference *dr)
3117 {
3118   stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3119   tree ref_type = TREE_TYPE (DR_REF (dr));
3120   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3121   unsigned HOST_WIDE_INT access_size = ref_size;
3122   if (GROUP_FIRST_ELEMENT (stmt_vinfo))
3123     {
3124       gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
3125       access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo);
3126     }
3127   if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3128       && (vect_supportable_dr_alignment (dr, false)
3129           == dr_explicit_realign_optimized))
3130     {
3131       /* We might access a full vector's worth.  */
3132       tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3133       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3134     }
3135   return access_size;
3136 }
3137
3138 /* Get the minimum alignment for all the scalar accesses that DR describes.  */
3139
3140 static unsigned int
3141 vect_vfa_align (const data_reference *dr)
3142 {
3143   return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3144 }
3145
3146 /* Function vect_no_alias_p.
3147
3148    Given data references A and B with equal base and offset, see whether
3149    the alias relation can be decided at compilation time.  Return 1 if
3150    it can and the references alias, 0 if it can and the references do
3151    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3152    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3153    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3154
3155 static int
3156 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3157                          tree segment_length_a, tree segment_length_b,
3158                          unsigned HOST_WIDE_INT access_size_a,
3159                          unsigned HOST_WIDE_INT access_size_b)
3160 {
3161   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3162   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3163   poly_uint64 const_length_a;
3164   poly_uint64 const_length_b;
3165
3166   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3167      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3168      [a, a+12) */
3169   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3170     {
3171       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3172       offset_a = (offset_a + access_size_a) - const_length_a;
3173     }
3174   else
3175     const_length_a = tree_to_poly_uint64 (segment_length_a);
3176   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3177     {
3178       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3179       offset_b = (offset_b + access_size_b) - const_length_b;
3180     }
3181   else
3182     const_length_b = tree_to_poly_uint64 (segment_length_b);
3183
3184   const_length_a += access_size_a;
3185   const_length_b += access_size_b;
3186
3187   if (ranges_known_overlap_p (offset_a, const_length_a,
3188                               offset_b, const_length_b))
3189     return 1;
3190
3191   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3192                                offset_b, const_length_b))
3193     return 0;
3194
3195   return -1;
3196 }
3197
3198 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3199    in DDR is >= VF.  */
3200
3201 static bool
3202 dependence_distance_ge_vf (data_dependence_relation *ddr,
3203                            unsigned int loop_depth, poly_uint64 vf)
3204 {
3205   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3206       || DDR_NUM_DIST_VECTS (ddr) == 0)
3207     return false;
3208
3209   /* If the dependence is exact, we should have limited the VF instead.  */
3210   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3211
3212   unsigned int i;
3213   lambda_vector dist_v;
3214   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3215     {
3216       HOST_WIDE_INT dist = dist_v[loop_depth];
3217       if (dist != 0
3218           && !(dist > 0 && DDR_REVERSED_P (ddr))
3219           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3220         return false;
3221     }
3222
3223   if (dump_enabled_p ())
3224     {
3225       dump_printf_loc (MSG_NOTE, vect_location,
3226                        "dependence distance between ");
3227       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3228       dump_printf (MSG_NOTE,  " and ");
3229       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3230       dump_printf (MSG_NOTE,  " is >= VF\n");
3231     }
3232
3233   return true;
3234 }
3235
3236 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3237
3238 static void
3239 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3240 {
3241   dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3242   dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3243   dump_printf (dump_kind, ") >= ");
3244   dump_dec (dump_kind, lower_bound.min_value);
3245 }
3246
3247 /* Record that the vectorized loop requires the vec_lower_bound described
3248    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3249
3250 static void
3251 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3252                         poly_uint64 min_value)
3253 {
3254   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3255   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3256     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3257       {
3258         unsigned_p &= lower_bounds[i].unsigned_p;
3259         min_value = upper_bound (lower_bounds[i].min_value, min_value);
3260         if (lower_bounds[i].unsigned_p != unsigned_p
3261             || maybe_lt (lower_bounds[i].min_value, min_value))
3262           {
3263             lower_bounds[i].unsigned_p = unsigned_p;
3264             lower_bounds[i].min_value = min_value;
3265             if (dump_enabled_p ())
3266               {
3267                 dump_printf_loc (MSG_NOTE, vect_location,
3268                                  "updating run-time check to ");
3269                 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3270                 dump_printf (MSG_NOTE, "\n");
3271               }
3272           }
3273         return;
3274       }
3275
3276   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3277   if (dump_enabled_p ())
3278     {
3279       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3280       dump_lower_bound (MSG_NOTE, lower_bound);
3281       dump_printf (MSG_NOTE, "\n");
3282     }
3283   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3284 }
3285
3286 /* Return true if it's unlikely that the step of the vectorized form of DR
3287    will span fewer than GAP bytes.  */
3288
3289 static bool
3290 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3291 {
3292   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3293   HOST_WIDE_INT count
3294     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3295   if (GROUP_FIRST_ELEMENT (stmt_info))
3296     count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
3297   return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3298 }
3299
3300 /* Return true if we know that there is no alias between DR_A and DR_B
3301    when abs (DR_STEP (DR_A)) >= N for some N.  When returning true, set
3302    *LOWER_BOUND_OUT to this N.  */
3303
3304 static bool
3305 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3306                                 poly_uint64 *lower_bound_out)
3307 {
3308   /* Check that there is a constant gap of known sign between DR_A
3309      and DR_B.  */
3310   poly_int64 init_a, init_b;
3311   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3312       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3313       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3314       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3315       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3316       || !ordered_p (init_a, init_b))
3317     return false;
3318
3319   /* Sort DR_A and DR_B by the address they access.  */
3320   if (maybe_lt (init_b, init_a))
3321     {
3322       std::swap (init_a, init_b);
3323       std::swap (dr_a, dr_b);
3324     }
3325
3326   /* If the two accesses could be dependent within a scalar iteration,
3327      make sure that we'd retain their order.  */
3328   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3329       && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
3330     return false;
3331
3332   /* There is no alias if abs (DR_STEP) is greater than or equal to
3333      the bytes spanned by the combination of the two accesses.  */
3334   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3335   return true;
3336 }
3337
3338 /* Function vect_prune_runtime_alias_test_list.
3339
3340    Prune a list of ddrs to be tested at run-time by versioning for alias.
3341    Merge several alias checks into one if possible.
3342    Return FALSE if resulting list of ddrs is longer then allowed by
3343    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3344
3345 bool
3346 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3347 {
3348   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3349   hash_set <tree_pair_hash> compared_objects;
3350
3351   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3352   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3353     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3354   vec<vec_object_pair> &check_unequal_addrs
3355     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3356   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3357   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3358
3359   ddr_p ddr;
3360   unsigned int i;
3361   tree length_factor;
3362
3363   if (dump_enabled_p ())
3364     dump_printf_loc (MSG_NOTE, vect_location,
3365                      "=== vect_prune_runtime_alias_test_list ===\n");
3366
3367   /* Step values are irrelevant for aliasing if the number of vector
3368      iterations is equal to the number of scalar iterations (which can
3369      happen for fully-SLP loops).  */
3370   bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3371
3372   if (!ignore_step_p)
3373     {
3374       /* Convert the checks for nonzero steps into bound tests.  */
3375       tree value;
3376       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3377         vect_check_lower_bound (loop_vinfo, value, true, 1);
3378     }
3379
3380   if (may_alias_ddrs.is_empty ())
3381     return true;
3382
3383   comp_alias_ddrs.create (may_alias_ddrs.length ());
3384
3385   unsigned int loop_depth
3386     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3387                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3388
3389   /* First, we collect all data ref pairs for aliasing checks.  */
3390   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3391     {
3392       int comp_res;
3393       poly_uint64 lower_bound;
3394       struct data_reference *dr_a, *dr_b;
3395       gimple *dr_group_first_a, *dr_group_first_b;
3396       tree segment_length_a, segment_length_b;
3397       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3398       unsigned int align_a, align_b;
3399       gimple *stmt_a, *stmt_b;
3400
3401       /* Ignore the alias if the VF we chose ended up being no greater
3402          than the dependence distance.  */
3403       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3404         continue;
3405
3406       if (DDR_OBJECT_A (ddr))
3407         {
3408           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3409           if (!compared_objects.add (new_pair))
3410             {
3411               if (dump_enabled_p ())
3412                 {
3413                   dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3414                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3415                   dump_printf (MSG_NOTE, " and ");
3416                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3417                   dump_printf (MSG_NOTE, " have different addresses\n");
3418                 }
3419               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3420             }
3421           continue;
3422         }
3423
3424       dr_a = DDR_A (ddr);
3425       stmt_a = DR_STMT (DDR_A (ddr));
3426
3427       dr_b = DDR_B (ddr);
3428       stmt_b = DR_STMT (DDR_B (ddr));
3429
3430       /* Skip the pair if inter-iteration dependencies are irrelevant
3431          and intra-iteration dependencies are guaranteed to be honored.  */
3432       if (ignore_step_p
3433           && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3434               || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3435         {
3436           if (dump_enabled_p ())
3437             {
3438               dump_printf_loc (MSG_NOTE, vect_location,
3439                                "no need for alias check between ");
3440               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3441               dump_printf (MSG_NOTE, " and ");
3442               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3443               dump_printf (MSG_NOTE, " when VF is 1\n");
3444             }
3445           continue;
3446         }
3447
3448       /* See whether we can handle the alias using a bounds check on
3449          the step, and whether that's likely to be the best approach.
3450          (It might not be, for example, if the minimum step is much larger
3451          than the number of bytes handled by one vector iteration.)  */
3452       if (!ignore_step_p
3453           && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3454           && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3455           && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3456               || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3457         {
3458           bool unsigned_p = dr_known_forward_stride_p (dr_a);
3459           if (dump_enabled_p ())
3460             {
3461               dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3462               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3463               dump_printf (MSG_NOTE, " and ");
3464               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3465               dump_printf (MSG_NOTE, " when the step ");
3466               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3467               dump_printf (MSG_NOTE, " is outside ");
3468               if (unsigned_p)
3469                 dump_printf (MSG_NOTE, "[0");
3470               else
3471                 {
3472                   dump_printf (MSG_NOTE, "(");
3473                   dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3474                 }
3475               dump_printf (MSG_NOTE, ", ");
3476               dump_dec (MSG_NOTE, lower_bound);
3477               dump_printf (MSG_NOTE, ")\n");
3478             }
3479           vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3480                                   lower_bound);
3481           continue;
3482         }
3483
3484       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3485       if (dr_group_first_a)
3486         {
3487           stmt_a = dr_group_first_a;
3488           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3489         }
3490
3491       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3492       if (dr_group_first_b)
3493         {
3494           stmt_b = dr_group_first_b;
3495           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3496         }
3497
3498       if (ignore_step_p)
3499         {
3500           segment_length_a = size_zero_node;
3501           segment_length_b = size_zero_node;
3502         }
3503       else
3504         {
3505           if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3506             length_factor = scalar_loop_iters;
3507           else
3508             length_factor = size_int (vect_factor);
3509           segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3510           segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3511         }
3512       access_size_a = vect_vfa_access_size (dr_a);
3513       access_size_b = vect_vfa_access_size (dr_b);
3514       align_a = vect_vfa_align (dr_a);
3515       align_b = vect_vfa_align (dr_b);
3516
3517       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3518                                         DR_BASE_ADDRESS (dr_b));
3519       if (comp_res == 0)
3520         comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3521                                           DR_OFFSET (dr_b));
3522
3523       /* See whether the alias is known at compilation time.  */
3524       if (comp_res == 0
3525           && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3526           && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3527           && poly_int_tree_p (segment_length_a)
3528           && poly_int_tree_p (segment_length_b))
3529         {
3530           int res = vect_compile_time_alias (dr_a, dr_b,
3531                                              segment_length_a,
3532                                              segment_length_b,
3533                                              access_size_a,
3534                                              access_size_b);
3535           if (res >= 0 && dump_enabled_p ())
3536             {
3537               dump_printf_loc (MSG_NOTE, vect_location,
3538                                "can tell at compile time that ");
3539               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3540               dump_printf (MSG_NOTE, " and ");
3541               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3542               if (res == 0)
3543                 dump_printf (MSG_NOTE, " do not alias\n");
3544               else
3545                 dump_printf (MSG_NOTE, " alias\n");
3546             }
3547
3548           if (res == 0)
3549             continue;
3550
3551           if (res == 1)
3552             {
3553               if (dump_enabled_p ())
3554                 dump_printf_loc (MSG_NOTE, vect_location,
3555                                  "not vectorized: compilation time alias.\n");
3556               return false;
3557             }
3558         }
3559
3560       dr_with_seg_len_pair_t dr_with_seg_len_pair
3561         (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3562          dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3563
3564       /* Canonicalize pairs by sorting the two DR members.  */
3565       if (comp_res > 0)
3566         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3567
3568       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3569     }
3570
3571   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3572
3573   unsigned int count = (comp_alias_ddrs.length ()
3574                         + check_unequal_addrs.length ());
3575
3576   dump_printf_loc (MSG_NOTE, vect_location,
3577                    "improved number of alias checks from %d to %d\n",
3578                    may_alias_ddrs.length (), count);
3579   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3580     {
3581       if (dump_enabled_p ())
3582         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3583                          "number of versioning for alias "
3584                          "run-time tests exceeds %d "
3585                          "(--param vect-max-version-for-alias-checks)\n",
3586                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3587       return false;
3588     }
3589
3590   return true;
3591 }
3592
3593 /* Check whether we can use an internal function for a gather load
3594    or scatter store.  READ_P is true for loads and false for stores.
3595    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3596    the type of the memory elements being loaded or stored.  OFFSET_BITS
3597    is the number of bits in each scalar offset and OFFSET_SIGN is the
3598    sign of the offset.  SCALE is the amount by which the offset should
3599    be multiplied *after* it has been converted to address width.
3600
3601    Return true if the function is supported, storing the function
3602    id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT.  */
3603
3604 bool
3605 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3606                           tree memory_type, unsigned int offset_bits,
3607                           signop offset_sign, int scale,
3608                           internal_fn *ifn_out, tree *element_type_out)
3609 {
3610   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3611   unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3612   if (offset_bits > element_bits)
3613     /* Internal functions require the offset to be the same width as
3614        the vector elements.  We can extend narrower offsets, but it isn't
3615        safe to truncate wider offsets.  */
3616     return false;
3617
3618   if (element_bits != memory_bits)
3619     /* For now the vector elements must be the same width as the
3620        memory elements.  */
3621     return false;
3622
3623   /* Work out which function we need.  */
3624   internal_fn ifn;
3625   if (read_p)
3626     ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3627   else
3628     ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3629
3630   /* Test whether the target supports this combination.  */
3631   if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3632                                                offset_sign, scale))
3633     return false;
3634
3635   *ifn_out = ifn;
3636   *element_type_out = TREE_TYPE (vectype);
3637   return true;
3638 }
3639
3640 /* CALL is a call to an internal gather load or scatter store function.
3641    Describe the operation in INFO.  */
3642
3643 static void
3644 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3645 {
3646   stmt_vec_info stmt_info = vinfo_for_stmt (call);
3647   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3648   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3649
3650   info->ifn = gimple_call_internal_fn (call);
3651   info->decl = NULL_TREE;
3652   info->base = gimple_call_arg (call, 0);
3653   info->offset = gimple_call_arg (call, 1);
3654   info->offset_dt = vect_unknown_def_type;
3655   info->offset_vectype = NULL_TREE;
3656   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3657   info->element_type = TREE_TYPE (vectype);
3658   info->memory_type = TREE_TYPE (DR_REF (dr));
3659 }
3660
3661 /* Return true if a non-affine read or write in STMT is suitable for a
3662    gather load or scatter store.  Describe the operation in *INFO if so.  */
3663
3664 bool
3665 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3666                            gather_scatter_info *info)
3667 {
3668   HOST_WIDE_INT scale = 1;
3669   poly_int64 pbitpos, pbitsize;
3670   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3671   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3672   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3673   tree offtype = NULL_TREE;
3674   tree decl = NULL_TREE, base, off;
3675   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3676   tree memory_type = TREE_TYPE (DR_REF (dr));
3677   machine_mode pmode;
3678   int punsignedp, reversep, pvolatilep = 0;
3679   internal_fn ifn;
3680   tree element_type;
3681   bool masked_p = false;
3682
3683   /* See whether this is already a call to a gather/scatter internal function.
3684      If not, see whether it's a masked load or store.  */
3685   gcall *call = dyn_cast <gcall *> (stmt);
3686   if (call && gimple_call_internal_p (call))
3687     {
3688       ifn = gimple_call_internal_fn (stmt);
3689       if (internal_gather_scatter_fn_p (ifn))
3690         {
3691           vect_describe_gather_scatter_call (call, info);
3692           return true;
3693         }
3694       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3695     }
3696
3697   /* True if we should aim to use internal functions rather than
3698      built-in functions.  */
3699   bool use_ifn_p = (DR_IS_READ (dr)
3700                     ? supports_vec_gather_load_p ()
3701                     : supports_vec_scatter_store_p ());
3702
3703   base = DR_REF (dr);
3704   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3705      see if we can use the def stmt of the address.  */
3706   if (masked_p
3707       && TREE_CODE (base) == MEM_REF
3708       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3709       && integer_zerop (TREE_OPERAND (base, 1))
3710       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3711     {
3712       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3713       if (is_gimple_assign (def_stmt)
3714           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3715         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3716     }
3717
3718   /* The gather and scatter builtins need address of the form
3719      loop_invariant + vector * {1, 2, 4, 8}
3720      or
3721      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3722      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3723      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3724      multiplications and additions in it.  To get a vector, we need
3725      a single SSA_NAME that will be defined in the loop and will
3726      contain everything that is not loop invariant and that can be
3727      vectorized.  The following code attempts to find such a preexistng
3728      SSA_NAME OFF and put the loop invariants into a tree BASE
3729      that can be gimplified before the loop.  */
3730   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3731                               &punsignedp, &reversep, &pvolatilep);
3732   gcc_assert (base && !reversep);
3733   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3734
3735   if (TREE_CODE (base) == MEM_REF)
3736     {
3737       if (!integer_zerop (TREE_OPERAND (base, 1)))
3738         {
3739           if (off == NULL_TREE)
3740             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3741           else
3742             off = size_binop (PLUS_EXPR, off,
3743                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3744         }
3745       base = TREE_OPERAND (base, 0);
3746     }
3747   else
3748     base = build_fold_addr_expr (base);
3749
3750   if (off == NULL_TREE)
3751     off = size_zero_node;
3752
3753   /* If base is not loop invariant, either off is 0, then we start with just
3754      the constant offset in the loop invariant BASE and continue with base
3755      as OFF, otherwise give up.
3756      We could handle that case by gimplifying the addition of base + off
3757      into some SSA_NAME and use that as off, but for now punt.  */
3758   if (!expr_invariant_in_loop_p (loop, base))
3759     {
3760       if (!integer_zerop (off))
3761         return false;
3762       off = base;
3763       base = size_int (pbytepos);
3764     }
3765   /* Otherwise put base + constant offset into the loop invariant BASE
3766      and continue with OFF.  */
3767   else
3768     {
3769       base = fold_convert (sizetype, base);
3770       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3771     }
3772
3773   /* OFF at this point may be either a SSA_NAME or some tree expression
3774      from get_inner_reference.  Try to peel off loop invariants from it
3775      into BASE as long as possible.  */
3776   STRIP_NOPS (off);
3777   while (offtype == NULL_TREE)
3778     {
3779       enum tree_code code;
3780       tree op0, op1, add = NULL_TREE;
3781
3782       if (TREE_CODE (off) == SSA_NAME)
3783         {
3784           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3785
3786           if (expr_invariant_in_loop_p (loop, off))
3787             return false;
3788
3789           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3790             break;
3791
3792           op0 = gimple_assign_rhs1 (def_stmt);
3793           code = gimple_assign_rhs_code (def_stmt);
3794           op1 = gimple_assign_rhs2 (def_stmt);
3795         }
3796       else
3797         {
3798           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3799             return false;
3800           code = TREE_CODE (off);
3801           extract_ops_from_tree (off, &code, &op0, &op1);
3802         }
3803       switch (code)
3804         {
3805         case POINTER_PLUS_EXPR:
3806         case PLUS_EXPR:
3807           if (expr_invariant_in_loop_p (loop, op0))
3808             {
3809               add = op0;
3810               off = op1;
3811             do_add:
3812               add = fold_convert (sizetype, add);
3813               if (scale != 1)
3814                 add = size_binop (MULT_EXPR, add, size_int (scale));
3815               base = size_binop (PLUS_EXPR, base, add);
3816               continue;
3817             }
3818           if (expr_invariant_in_loop_p (loop, op1))
3819             {
3820               add = op1;
3821               off = op0;
3822               goto do_add;
3823             }
3824           break;
3825         case MINUS_EXPR:
3826           if (expr_invariant_in_loop_p (loop, op1))
3827             {
3828               add = fold_convert (sizetype, op1);
3829               add = size_binop (MINUS_EXPR, size_zero_node, add);
3830               off = op0;
3831               goto do_add;
3832             }
3833           break;
3834         case MULT_EXPR:
3835           if (scale == 1 && tree_fits_shwi_p (op1))
3836             {
3837               int new_scale = tree_to_shwi (op1);
3838               /* Only treat this as a scaling operation if the target
3839                  supports it.  */
3840               if (use_ifn_p
3841                   && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3842                                                 vectype, memory_type, 1,
3843                                                 TYPE_SIGN (TREE_TYPE (op0)),
3844                                                 new_scale, &ifn,
3845                                                 &element_type))
3846                 break;
3847               scale = new_scale;
3848               off = op0;
3849               continue;
3850             }
3851           break;
3852         case SSA_NAME:
3853           off = op0;
3854           continue;
3855         CASE_CONVERT:
3856           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3857               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3858             break;
3859           if (TYPE_PRECISION (TREE_TYPE (op0))
3860               == TYPE_PRECISION (TREE_TYPE (off)))
3861             {
3862               off = op0;
3863               continue;
3864             }
3865
3866           /* The internal functions need the offset to be the same width
3867              as the elements of VECTYPE.  Don't include operations that
3868              cast the offset from that width to a different width.  */
3869           if (use_ifn_p
3870               && (int_size_in_bytes (TREE_TYPE (vectype))
3871                   == int_size_in_bytes (TREE_TYPE (off))))
3872             break;
3873
3874           if (TYPE_PRECISION (TREE_TYPE (op0))
3875               < TYPE_PRECISION (TREE_TYPE (off)))
3876             {
3877               off = op0;
3878               offtype = TREE_TYPE (off);
3879               STRIP_NOPS (off);
3880               continue;
3881             }
3882           break;
3883         default:
3884           break;
3885         }
3886       break;
3887     }
3888
3889   /* If at the end OFF still isn't a SSA_NAME or isn't
3890      defined in the loop, punt.  */
3891   if (TREE_CODE (off) != SSA_NAME
3892       || expr_invariant_in_loop_p (loop, off))
3893     return false;
3894
3895   if (offtype == NULL_TREE)
3896     offtype = TREE_TYPE (off);
3897
3898   if (use_ifn_p)
3899     {
3900       if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3901                                      memory_type, TYPE_PRECISION (offtype),
3902                                      TYPE_SIGN (offtype), scale, &ifn,
3903                                      &element_type))
3904         return false;
3905     }
3906   else
3907     {
3908       if (DR_IS_READ (dr))
3909         {
3910           if (targetm.vectorize.builtin_gather)
3911             decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3912         }
3913       else
3914         {
3915           if (targetm.vectorize.builtin_scatter)
3916             decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3917         }
3918
3919       if (!decl)
3920         return false;
3921
3922       ifn = IFN_LAST;
3923       element_type = TREE_TYPE (vectype);
3924     }
3925
3926   info->ifn = ifn;
3927   info->decl = decl;
3928   info->base = base;
3929   info->offset = off;
3930   info->offset_dt = vect_unknown_def_type;
3931   info->offset_vectype = NULL_TREE;
3932   info->scale = scale;
3933   info->element_type = element_type;
3934   info->memory_type = memory_type;
3935   return true;
3936 }
3937
3938 /* Function vect_analyze_data_refs.
3939
3940   Find all the data references in the loop or basic block.
3941
3942    The general structure of the analysis of data refs in the vectorizer is as
3943    follows:
3944    1- vect_analyze_data_refs(loop/bb): call
3945       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3946       in the loop/bb and their dependences.
3947    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3948    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3949    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3950
3951 */
3952
3953 bool
3954 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3955 {
3956   struct loop *loop = NULL;
3957   unsigned int i;
3958   struct data_reference *dr;
3959   tree scalar_type;
3960
3961   if (dump_enabled_p ())
3962     dump_printf_loc (MSG_NOTE, vect_location,
3963                      "=== vect_analyze_data_refs ===\n");
3964
3965   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3966     loop = LOOP_VINFO_LOOP (loop_vinfo);
3967
3968   /* Go through the data-refs, check that the analysis succeeded.  Update
3969      pointer from stmt_vec_info struct to DR and vectype.  */
3970
3971   vec<data_reference_p> datarefs = vinfo->datarefs;
3972   FOR_EACH_VEC_ELT (datarefs, i, dr)
3973     {
3974       gimple *stmt;
3975       stmt_vec_info stmt_info;
3976       tree base, offset, init;
3977       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3978       bool simd_lane_access = false;
3979       poly_uint64 vf;
3980
3981 again:
3982       if (!dr || !DR_REF (dr))
3983         {
3984           if (dump_enabled_p ())
3985             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3986                              "not vectorized: unhandled data-ref\n");
3987           return false;
3988         }
3989
3990       stmt = DR_STMT (dr);
3991       stmt_info = vinfo_for_stmt (stmt);
3992
3993       /* Discard clobbers from the dataref vector.  We will remove
3994          clobber stmts during vectorization.  */
3995       if (gimple_clobber_p (stmt))
3996         {
3997           free_data_ref (dr);
3998           if (i == datarefs.length () - 1)
3999             {
4000               datarefs.pop ();
4001               break;
4002             }
4003           datarefs.ordered_remove (i);
4004           dr = datarefs[i];
4005           goto again;
4006         }
4007
4008       /* Check that analysis of the data-ref succeeded.  */
4009       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4010           || !DR_STEP (dr))
4011         {
4012           bool maybe_gather
4013             = DR_IS_READ (dr)
4014               && !TREE_THIS_VOLATILE (DR_REF (dr))
4015               && (targetm.vectorize.builtin_gather != NULL
4016                   || supports_vec_gather_load_p ());
4017           bool maybe_scatter
4018             = DR_IS_WRITE (dr)
4019               && !TREE_THIS_VOLATILE (DR_REF (dr))
4020               && (targetm.vectorize.builtin_scatter != NULL
4021                   || supports_vec_scatter_store_p ());
4022           bool maybe_simd_lane_access
4023             = is_a <loop_vec_info> (vinfo) && loop->simduid;
4024
4025           /* If target supports vector gather loads or scatter stores, or if
4026              this might be a SIMD lane access, see if they can't be used.  */
4027           if (is_a <loop_vec_info> (vinfo)
4028               && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4029               && !nested_in_vect_loop_p (loop, stmt))
4030             {
4031               struct data_reference *newdr
4032                 = create_data_ref (NULL, loop_containing_stmt (stmt),
4033                                    DR_REF (dr), stmt, !maybe_scatter,
4034                                    DR_IS_CONDITIONAL_IN_STMT (dr));
4035               gcc_assert (newdr != NULL && DR_REF (newdr));
4036               if (DR_BASE_ADDRESS (newdr)
4037                   && DR_OFFSET (newdr)
4038                   && DR_INIT (newdr)
4039                   && DR_STEP (newdr)
4040                   && integer_zerop (DR_STEP (newdr)))
4041                 {
4042                   if (maybe_simd_lane_access)
4043                     {
4044                       tree off = DR_OFFSET (newdr);
4045                       STRIP_NOPS (off);
4046                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4047                           && TREE_CODE (off) == MULT_EXPR
4048                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4049                         {
4050                           tree step = TREE_OPERAND (off, 1);
4051                           off = TREE_OPERAND (off, 0);
4052                           STRIP_NOPS (off);
4053                           if (CONVERT_EXPR_P (off)
4054                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4055                                                                           0)))
4056                                  < TYPE_PRECISION (TREE_TYPE (off)))
4057                             off = TREE_OPERAND (off, 0);
4058                           if (TREE_CODE (off) == SSA_NAME)
4059                             {
4060                               gimple *def = SSA_NAME_DEF_STMT (off);
4061                               tree reft = TREE_TYPE (DR_REF (newdr));
4062                               if (is_gimple_call (def)
4063                                   && gimple_call_internal_p (def)
4064                                   && (gimple_call_internal_fn (def)
4065                                       == IFN_GOMP_SIMD_LANE))
4066                                 {
4067                                   tree arg = gimple_call_arg (def, 0);
4068                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
4069                                   arg = SSA_NAME_VAR (arg);
4070                                   if (arg == loop->simduid
4071                                       /* For now.  */
4072                                       && tree_int_cst_equal
4073                                            (TYPE_SIZE_UNIT (reft),
4074                                             step))
4075                                     {
4076                                       DR_OFFSET (newdr) = ssize_int (0);
4077                                       DR_STEP (newdr) = step;
4078                                       DR_OFFSET_ALIGNMENT (newdr)
4079                                         = BIGGEST_ALIGNMENT;
4080                                       DR_STEP_ALIGNMENT (newdr)
4081                                         = highest_pow2_factor (step);
4082                                       dr = newdr;
4083                                       simd_lane_access = true;
4084                                     }
4085                                 }
4086                             }
4087                         }
4088                     }
4089                   if (!simd_lane_access && (maybe_gather || maybe_scatter))
4090                     {
4091                       dr = newdr;
4092                       if (maybe_gather)
4093                         gatherscatter = GATHER;
4094                       else
4095                         gatherscatter = SCATTER;
4096                     }
4097                 }
4098               if (gatherscatter == SG_NONE && !simd_lane_access)
4099                 free_data_ref (newdr);
4100             }
4101
4102           if (gatherscatter == SG_NONE && !simd_lane_access)
4103             {
4104               if (dump_enabled_p ())
4105                 {
4106                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4107                                    "not vectorized: data ref analysis "
4108                                    "failed ");
4109                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4110                 }
4111
4112               if (is_a <bb_vec_info> (vinfo))
4113                 break;
4114
4115               return false;
4116             }
4117         }
4118
4119       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4120         {
4121           if (dump_enabled_p ())
4122             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4123                              "not vectorized: base addr of dr is a "
4124                              "constant\n");
4125
4126           if (is_a <bb_vec_info> (vinfo))
4127             break;
4128
4129           if (gatherscatter != SG_NONE || simd_lane_access)
4130             free_data_ref (dr);
4131           return false;
4132         }
4133
4134       if (TREE_THIS_VOLATILE (DR_REF (dr)))
4135         {
4136           if (dump_enabled_p ())
4137             {
4138               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4139                                "not vectorized: volatile type ");
4140               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4141             }
4142
4143           if (is_a <bb_vec_info> (vinfo))
4144             break;
4145
4146           return false;
4147         }
4148
4149       if (stmt_can_throw_internal (stmt))
4150         {
4151           if (dump_enabled_p ())
4152             {
4153               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4154                                "not vectorized: statement can throw an "
4155                                "exception ");
4156               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4157             }
4158
4159           if (is_a <bb_vec_info> (vinfo))
4160             break;
4161
4162           if (gatherscatter != SG_NONE || simd_lane_access)
4163             free_data_ref (dr);
4164           return false;
4165         }
4166
4167       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4168           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4169         {
4170           if (dump_enabled_p ())
4171             {
4172               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4173                                "not vectorized: statement is bitfield "
4174                                "access ");
4175               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4176             }
4177
4178           if (is_a <bb_vec_info> (vinfo))
4179             break;
4180
4181           if (gatherscatter != SG_NONE || simd_lane_access)
4182             free_data_ref (dr);
4183           return false;
4184         }
4185
4186       base = unshare_expr (DR_BASE_ADDRESS (dr));
4187       offset = unshare_expr (DR_OFFSET (dr));
4188       init = unshare_expr (DR_INIT (dr));
4189
4190       if (is_gimple_call (stmt)
4191           && (!gimple_call_internal_p (stmt)
4192               || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
4193                   && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
4194         {
4195           if (dump_enabled_p ())
4196             {
4197               dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
4198                                "not vectorized: dr in a call ");
4199               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4200             }
4201
4202           if (is_a <bb_vec_info> (vinfo))
4203             break;
4204
4205           if (gatherscatter != SG_NONE || simd_lane_access)
4206             free_data_ref (dr);
4207           return false;
4208         }
4209
4210       /* Update DR field in stmt_vec_info struct.  */
4211
4212       /* If the dataref is in an inner-loop of the loop that is considered for
4213          for vectorization, we also want to analyze the access relative to
4214          the outer-loop (DR contains information only relative to the
4215          inner-most enclosing loop).  We do that by building a reference to the
4216          first location accessed by the inner-loop, and analyze it relative to
4217          the outer-loop.  */
4218       if (loop && nested_in_vect_loop_p (loop, stmt))
4219         {
4220           /* Build a reference to the first location accessed by the
4221              inner loop: *(BASE + INIT + OFFSET).  By construction,
4222              this address must be invariant in the inner loop, so we
4223              can consider it as being used in the outer loop.  */
4224           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4225                                           init, offset);
4226           tree init_addr = fold_build_pointer_plus (base, init_offset);
4227           tree init_ref = build_fold_indirect_ref (init_addr);
4228
4229           if (dump_enabled_p ())
4230             {
4231               dump_printf_loc (MSG_NOTE, vect_location,
4232                                "analyze in outer loop: ");
4233               dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4234               dump_printf (MSG_NOTE, "\n");
4235             }
4236
4237           if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4238                                      init_ref, loop))
4239             /* dr_analyze_innermost already explained the failure.  */
4240             return false;
4241
4242           if (dump_enabled_p ())
4243             {
4244               dump_printf_loc (MSG_NOTE, vect_location,
4245                                "\touter base_address: ");
4246               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4247                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4248               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4249               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4250                                  STMT_VINFO_DR_OFFSET (stmt_info));
4251               dump_printf (MSG_NOTE,
4252                            "\n\touter constant offset from base address: ");
4253               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4254                                  STMT_VINFO_DR_INIT (stmt_info));
4255               dump_printf (MSG_NOTE, "\n\touter step: ");
4256               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4257                                  STMT_VINFO_DR_STEP (stmt_info));
4258               dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4259                            STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4260               dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4261                            STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4262               dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4263                            STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4264               dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4265                            STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4266             }
4267         }
4268
4269       if (STMT_VINFO_DATA_REF (stmt_info))
4270         {
4271           if (dump_enabled_p ())
4272             {
4273               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4274                                "not vectorized: more than one data ref "
4275                                "in stmt: ");
4276               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4277             }
4278
4279           if (is_a <bb_vec_info> (vinfo))
4280             break;
4281
4282           if (gatherscatter != SG_NONE || simd_lane_access)
4283             free_data_ref (dr);
4284           return false;
4285         }
4286
4287       STMT_VINFO_DATA_REF (stmt_info) = dr;
4288       if (simd_lane_access)
4289         {
4290           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4291           free_data_ref (datarefs[i]);
4292           datarefs[i] = dr;
4293         }
4294
4295       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4296           && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4297           && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4298         {
4299           if (dump_enabled_p ())
4300             {
4301               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4302                                "not vectorized: base object not addressable "
4303                                "for stmt: ");
4304               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4305             }
4306           if (is_a <bb_vec_info> (vinfo))
4307             {
4308               /* In BB vectorization the ref can still participate
4309                  in dependence analysis, we just can't vectorize it.  */
4310               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4311               continue;
4312             }
4313           return false;
4314         }
4315
4316       /* Set vectype for STMT.  */
4317       scalar_type = TREE_TYPE (DR_REF (dr));
4318       STMT_VINFO_VECTYPE (stmt_info)
4319         = get_vectype_for_scalar_type (scalar_type);
4320       if (!STMT_VINFO_VECTYPE (stmt_info))
4321         {
4322           if (dump_enabled_p ())
4323             {
4324               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4325                                "not vectorized: no vectype for stmt: ");
4326               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4327               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4328               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4329                                  scalar_type);
4330               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4331             }
4332
4333           if (is_a <bb_vec_info> (vinfo))
4334             {
4335               /* No vector type is fine, the ref can still participate
4336                  in dependence analysis, we just can't vectorize it.  */
4337               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4338               continue;
4339             }
4340
4341           if (gatherscatter != SG_NONE || simd_lane_access)
4342             {
4343               STMT_VINFO_DATA_REF (stmt_info) = NULL;
4344               if (gatherscatter != SG_NONE)
4345                 free_data_ref (dr);
4346             }
4347           return false;
4348         }
4349       else
4350         {
4351           if (dump_enabled_p ())
4352             {
4353               dump_printf_loc (MSG_NOTE, vect_location,
4354                                "got vectype for stmt: ");
4355               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4356               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4357                                  STMT_VINFO_VECTYPE (stmt_info));
4358               dump_printf (MSG_NOTE, "\n");
4359             }
4360         }
4361
4362       /* Adjust the minimal vectorization factor according to the
4363          vector type.  */
4364       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4365       *min_vf = upper_bound (*min_vf, vf);
4366
4367       if (gatherscatter != SG_NONE)
4368         {
4369           gather_scatter_info gs_info;
4370           if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4371                                           &gs_info)
4372               || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4373             {
4374               STMT_VINFO_DATA_REF (stmt_info) = NULL;
4375               free_data_ref (dr);
4376               if (dump_enabled_p ())
4377                 {
4378                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4379                                    (gatherscatter == GATHER) ?
4380                                    "not vectorized: not suitable for gather "
4381                                    "load " :
4382                                    "not vectorized: not suitable for scatter "
4383                                    "store ");
4384                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4385                 }
4386               return false;
4387             }
4388
4389           free_data_ref (datarefs[i]);
4390           datarefs[i] = dr;
4391           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4392         }
4393
4394       else if (is_a <loop_vec_info> (vinfo)
4395                && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4396         {
4397           if (nested_in_vect_loop_p (loop, stmt))
4398             {
4399               if (dump_enabled_p ())
4400                 {
4401                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4402                                    "not vectorized: not suitable for strided "
4403                                    "load ");
4404                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4405                 }
4406               return false;
4407             }
4408           STMT_VINFO_STRIDED_P (stmt_info) = true;
4409         }
4410     }
4411
4412   /* If we stopped analysis at the first dataref we could not analyze
4413      when trying to vectorize a basic-block mark the rest of the datarefs
4414      as not vectorizable and truncate the vector of datarefs.  That
4415      avoids spending useless time in analyzing their dependence.  */
4416   if (i != datarefs.length ())
4417     {
4418       gcc_assert (is_a <bb_vec_info> (vinfo));
4419       for (unsigned j = i; j < datarefs.length (); ++j)
4420         {
4421           data_reference_p dr = datarefs[j];
4422           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4423           free_data_ref (dr);
4424         }
4425       datarefs.truncate (i);
4426     }
4427
4428   return true;
4429 }
4430
4431
4432 /* Function vect_get_new_vect_var.
4433
4434    Returns a name for a new variable.  The current naming scheme appends the
4435    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4436    the name of vectorizer generated variables, and appends that to NAME if
4437    provided.  */
4438
4439 tree
4440 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4441 {
4442   const char *prefix;
4443   tree new_vect_var;
4444
4445   switch (var_kind)
4446   {
4447   case vect_simple_var:
4448     prefix = "vect";
4449     break;
4450   case vect_scalar_var:
4451     prefix = "stmp";
4452     break;
4453   case vect_mask_var:
4454     prefix = "mask";
4455     break;
4456   case vect_pointer_var:
4457     prefix = "vectp";
4458     break;
4459   default:
4460     gcc_unreachable ();
4461   }
4462
4463   if (name)
4464     {
4465       char* tmp = concat (prefix, "_", name, NULL);
4466       new_vect_var = create_tmp_reg (type, tmp);
4467       free (tmp);
4468     }
4469   else
4470     new_vect_var = create_tmp_reg (type, prefix);
4471
4472   return new_vect_var;
4473 }
4474
4475 /* Like vect_get_new_vect_var but return an SSA name.  */
4476
4477 tree
4478 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4479 {
4480   const char *prefix;
4481   tree new_vect_var;
4482
4483   switch (var_kind)
4484   {
4485   case vect_simple_var:
4486     prefix = "vect";
4487     break;
4488   case vect_scalar_var:
4489     prefix = "stmp";
4490     break;
4491   case vect_pointer_var:
4492     prefix = "vectp";
4493     break;
4494   default:
4495     gcc_unreachable ();
4496   }
4497
4498   if (name)
4499     {
4500       char* tmp = concat (prefix, "_", name, NULL);
4501       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4502       free (tmp);
4503     }
4504   else
4505     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4506
4507   return new_vect_var;
4508 }
4509
4510 /* Duplicate ptr info and set alignment/misaligment on NAME from DR.  */
4511
4512 static void
4513 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4514 {
4515   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4516   int misalign = DR_MISALIGNMENT (dr);
4517   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4518     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4519   else
4520     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4521                             DR_TARGET_ALIGNMENT (dr), misalign);
4522 }
4523
4524 /* Function vect_create_addr_base_for_vector_ref.
4525
4526    Create an expression that computes the address of the first memory location
4527    that will be accessed for a data reference.
4528
4529    Input:
4530    STMT: The statement containing the data reference.
4531    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4532    OFFSET: Optional. If supplied, it is be added to the initial address.
4533    LOOP:    Specify relative to which loop-nest should the address be computed.
4534             For example, when the dataref is in an inner-loop nested in an
4535             outer-loop that is now being vectorized, LOOP can be either the
4536             outer-loop, or the inner-loop.  The first memory location accessed
4537             by the following dataref ('in' points to short):
4538
4539                 for (i=0; i<N; i++)
4540                    for (j=0; j<M; j++)
4541                      s += in[i+j]
4542
4543             is as follows:
4544             if LOOP=i_loop:     &in             (relative to i_loop)
4545             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4546    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4547             initial address.  Unlike OFFSET, which is number of elements to
4548             be added, BYTE_OFFSET is measured in bytes.
4549
4550    Output:
4551    1. Return an SSA_NAME whose value is the address of the memory location of
4552       the first vector of the data reference.
4553    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4554       these statement(s) which define the returned SSA_NAME.
4555
4556    FORNOW: We are only handling array accesses with step 1.  */
4557
4558 tree
4559 vect_create_addr_base_for_vector_ref (gimple *stmt,
4560                                       gimple_seq *new_stmt_list,
4561                                       tree offset,
4562                                       tree byte_offset)
4563 {
4564   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4565   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4566   const char *base_name;
4567   tree addr_base;
4568   tree dest;
4569   gimple_seq seq = NULL;
4570   tree vect_ptr_type;
4571   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4572   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4573   innermost_loop_behavior *drb = vect_dr_behavior (dr);
4574
4575   tree data_ref_base = unshare_expr (drb->base_address);
4576   tree base_offset = unshare_expr (drb->offset);
4577   tree init = unshare_expr (drb->init);
4578
4579   if (loop_vinfo)
4580     base_name = get_name (data_ref_base);
4581   else
4582     {
4583       base_offset = ssize_int (0);
4584       init = ssize_int (0);
4585       base_name = get_name (DR_REF (dr));
4586     }
4587
4588   /* Create base_offset */
4589   base_offset = size_binop (PLUS_EXPR,
4590                             fold_convert (sizetype, base_offset),
4591                             fold_convert (sizetype, init));
4592
4593   if (offset)
4594     {
4595       offset = fold_build2 (MULT_EXPR, sizetype,
4596                             fold_convert (sizetype, offset), step);
4597       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4598                                  base_offset, offset);
4599     }
4600   if (byte_offset)
4601     {
4602       byte_offset = fold_convert (sizetype, byte_offset);
4603       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4604                                  base_offset, byte_offset);
4605     }
4606
4607   /* base + base_offset */
4608   if (loop_vinfo)
4609     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4610   else
4611     {
4612       addr_base = build1 (ADDR_EXPR,
4613                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4614                           unshare_expr (DR_REF (dr)));
4615     }
4616
4617   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4618   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4619   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4620   gimple_seq_add_seq (new_stmt_list, seq);
4621
4622   if (DR_PTR_INFO (dr)
4623       && TREE_CODE (addr_base) == SSA_NAME
4624       && !SSA_NAME_PTR_INFO (addr_base))
4625     {
4626       vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4627       if (offset || byte_offset)
4628         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4629     }
4630
4631   if (dump_enabled_p ())
4632     {
4633       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4634       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4635       dump_printf (MSG_NOTE, "\n");
4636     }
4637
4638   return addr_base;
4639 }
4640
4641
4642 /* Function vect_create_data_ref_ptr.
4643
4644    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4645    location accessed in the loop by STMT, along with the def-use update
4646    chain to appropriately advance the pointer through the loop iterations.
4647    Also set aliasing information for the pointer.  This pointer is used by
4648    the callers to this function to create a memory reference expression for
4649    vector load/store access.
4650
4651    Input:
4652    1. STMT: a stmt that references memory. Expected to be of the form
4653          GIMPLE_ASSIGN <name, data-ref> or
4654          GIMPLE_ASSIGN <data-ref, name>.
4655    2. AGGR_TYPE: the type of the reference, which should be either a vector
4656         or an array.
4657    3. AT_LOOP: the loop where the vector memref is to be created.
4658    4. OFFSET (optional): an offset to be added to the initial address accessed
4659         by the data-ref in STMT.
4660    5. BSI: location where the new stmts are to be placed if there is no loop
4661    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4662         pointing to the initial address.
4663    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4664         to the initial address accessed by the data-ref in STMT.  This is
4665         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4666         in bytes.
4667    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4668         to the IV during each iteration of the loop.  NULL says to move
4669         by one copy of AGGR_TYPE up or down, depending on the step of the
4670         data reference.
4671
4672    Output:
4673    1. Declare a new ptr to vector_type, and have it point to the base of the
4674       data reference (initial addressed accessed by the data reference).
4675       For example, for vector of type V8HI, the following code is generated:
4676
4677       v8hi *ap;
4678       ap = (v8hi *)initial_address;
4679
4680       if OFFSET is not supplied:
4681          initial_address = &a[init];
4682       if OFFSET is supplied:
4683          initial_address = &a[init + OFFSET];
4684       if BYTE_OFFSET is supplied:
4685          initial_address = &a[init] + BYTE_OFFSET;
4686
4687       Return the initial_address in INITIAL_ADDRESS.
4688
4689    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4690       update the pointer in each iteration of the loop.
4691
4692       Return the increment stmt that updates the pointer in PTR_INCR.
4693
4694    3. Set INV_P to true if the access pattern of the data reference in the
4695       vectorized loop is invariant.  Set it to false otherwise.
4696
4697    4. Return the pointer.  */
4698
4699 tree
4700 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4701                           tree offset, tree *initial_address,
4702                           gimple_stmt_iterator *gsi, gimple **ptr_incr,
4703                           bool only_init, bool *inv_p, tree byte_offset,
4704                           tree iv_step)
4705 {
4706   const char *base_name;
4707   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4708   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4709   struct loop *loop = NULL;
4710   bool nested_in_vect_loop = false;
4711   struct loop *containing_loop = NULL;
4712   tree aggr_ptr_type;
4713   tree aggr_ptr;
4714   tree new_temp;
4715   gimple_seq new_stmt_list = NULL;
4716   edge pe = NULL;
4717   basic_block new_bb;
4718   tree aggr_ptr_init;
4719   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4720   tree aptr;
4721   gimple_stmt_iterator incr_gsi;
4722   bool insert_after;
4723   tree indx_before_incr, indx_after_incr;
4724   gimple *incr;
4725   tree step;
4726   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4727
4728   gcc_assert (iv_step != NULL_TREE
4729               || TREE_CODE (aggr_type) == ARRAY_TYPE
4730               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4731
4732   if (loop_vinfo)
4733     {
4734       loop = LOOP_VINFO_LOOP (loop_vinfo);
4735       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4736       containing_loop = (gimple_bb (stmt))->loop_father;
4737       pe = loop_preheader_edge (loop);
4738     }
4739   else
4740     {
4741       gcc_assert (bb_vinfo);
4742       only_init = true;
4743       *ptr_incr = NULL;
4744     }
4745
4746   /* Check the step (evolution) of the load in LOOP, and record
4747      whether it's invariant.  */
4748   step = vect_dr_behavior (dr)->step;
4749   if (integer_zerop (step))
4750     *inv_p = true;
4751   else
4752     *inv_p = false;
4753
4754   /* Create an expression for the first address accessed by this load
4755      in LOOP.  */
4756   base_name = get_name (DR_BASE_ADDRESS (dr));
4757
4758   if (dump_enabled_p ())
4759     {
4760       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4761       dump_printf_loc (MSG_NOTE, vect_location,
4762                        "create %s-pointer variable to type: ",
4763                        get_tree_code_name (TREE_CODE (aggr_type)));
4764       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4765       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4766         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4767       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4768         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4769       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4770         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4771       else
4772         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4773       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4774       dump_printf (MSG_NOTE, "\n");
4775     }
4776
4777   /* (1) Create the new aggregate-pointer variable.
4778      Vector and array types inherit the alias set of their component
4779      type by default so we need to use a ref-all pointer if the data
4780      reference does not conflict with the created aggregated data
4781      reference because it is not addressable.  */
4782   bool need_ref_all = false;
4783   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4784                               get_alias_set (DR_REF (dr))))
4785     need_ref_all = true;
4786   /* Likewise for any of the data references in the stmt group.  */
4787   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4788     {
4789       gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4790       do
4791         {
4792           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4793           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4794           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4795                                       get_alias_set (DR_REF (sdr))))
4796             {
4797               need_ref_all = true;
4798               break;
4799             }
4800           orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4801         }
4802       while (orig_stmt);
4803     }
4804   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4805                                                need_ref_all);
4806   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4807
4808
4809   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4810      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4811      def-use update cycles for the pointer: one relative to the outer-loop
4812      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4813      to the inner-loop (which is the inner-most loop containing the dataref),
4814      and this is done be step (5) below.
4815
4816      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4817      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4818      redundant.  Steps (3),(4) create the following:
4819
4820         vp0 = &base_addr;
4821         LOOP:   vp1 = phi(vp0,vp2)
4822                 ...
4823                 ...
4824                 vp2 = vp1 + step
4825                 goto LOOP
4826
4827      If there is an inner-loop nested in loop, then step (5) will also be
4828      applied, and an additional update in the inner-loop will be created:
4829
4830         vp0 = &base_addr;
4831         LOOP:   vp1 = phi(vp0,vp2)
4832                 ...
4833         inner:     vp3 = phi(vp1,vp4)
4834                    vp4 = vp3 + inner_step
4835                    if () goto inner
4836                 ...
4837                 vp2 = vp1 + step
4838                 if () goto LOOP   */
4839
4840   /* (2) Calculate the initial address of the aggregate-pointer, and set
4841      the aggregate-pointer to point to it before the loop.  */
4842
4843   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4844
4845   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4846                                                    offset, byte_offset);
4847   if (new_stmt_list)
4848     {
4849       if (pe)
4850         {
4851           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4852           gcc_assert (!new_bb);
4853         }
4854       else
4855         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4856     }
4857
4858   *initial_address = new_temp;
4859   aggr_ptr_init = new_temp;
4860
4861   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4862      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4863      inner-loop nested in LOOP (during outer-loop vectorization).  */
4864
4865   /* No update in loop is required.  */
4866   if (only_init && (!loop_vinfo || at_loop == loop))
4867     aptr = aggr_ptr_init;
4868   else
4869     {
4870       if (iv_step == NULL_TREE)
4871         {
4872           /* The step of the aggregate pointer is the type size.  */
4873           iv_step = TYPE_SIZE_UNIT (aggr_type);
4874           /* One exception to the above is when the scalar step of the load in
4875              LOOP is zero. In this case the step here is also zero.  */
4876           if (*inv_p)
4877             iv_step = size_zero_node;
4878           else if (tree_int_cst_sgn (step) == -1)
4879             iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4880         }
4881
4882       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4883
4884       create_iv (aggr_ptr_init,
4885                  fold_convert (aggr_ptr_type, iv_step),
4886                  aggr_ptr, loop, &incr_gsi, insert_after,
4887                  &indx_before_incr, &indx_after_incr);
4888       incr = gsi_stmt (incr_gsi);
4889       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4890
4891       /* Copy the points-to information if it exists. */
4892       if (DR_PTR_INFO (dr))
4893         {
4894           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4895           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4896         }
4897       if (ptr_incr)
4898         *ptr_incr = incr;
4899
4900       aptr = indx_before_incr;
4901     }
4902
4903   if (!nested_in_vect_loop || only_init)
4904     return aptr;
4905
4906
4907   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4908      nested in LOOP, if exists.  */
4909
4910   gcc_assert (nested_in_vect_loop);
4911   if (!only_init)
4912     {
4913       standard_iv_increment_position (containing_loop, &incr_gsi,
4914                                       &insert_after);
4915       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4916                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4917                  &indx_after_incr);
4918       incr = gsi_stmt (incr_gsi);
4919       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4920
4921       /* Copy the points-to information if it exists. */
4922       if (DR_PTR_INFO (dr))
4923         {
4924           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4925           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4926         }
4927       if (ptr_incr)
4928         *ptr_incr = incr;
4929
4930       return indx_before_incr;
4931     }
4932   else
4933     gcc_unreachable ();
4934 }
4935
4936
4937 /* Function bump_vector_ptr
4938
4939    Increment a pointer (to a vector type) by vector-size. If requested,
4940    i.e. if PTR-INCR is given, then also connect the new increment stmt
4941    to the existing def-use update-chain of the pointer, by modifying
4942    the PTR_INCR as illustrated below:
4943
4944    The pointer def-use update-chain before this function:
4945                         DATAREF_PTR = phi (p_0, p_2)
4946                         ....
4947         PTR_INCR:       p_2 = DATAREF_PTR + step
4948
4949    The pointer def-use update-chain after this function:
4950                         DATAREF_PTR = phi (p_0, p_2)
4951                         ....
4952                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4953                         ....
4954         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4955
4956    Input:
4957    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4958                  in the loop.
4959    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4960               the loop.  The increment amount across iterations is expected
4961               to be vector_size.
4962    BSI - location where the new update stmt is to be placed.
4963    STMT - the original scalar memory-access stmt that is being vectorized.
4964    BUMP - optional. The offset by which to bump the pointer. If not given,
4965           the offset is assumed to be vector_size.
4966
4967    Output: Return NEW_DATAREF_PTR as illustrated above.
4968
4969 */
4970
4971 tree
4972 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4973                  gimple *stmt, tree bump)
4974 {
4975   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4976   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4977   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4978   tree update = TYPE_SIZE_UNIT (vectype);
4979   gassign *incr_stmt;
4980   ssa_op_iter iter;
4981   use_operand_p use_p;
4982   tree new_dataref_ptr;
4983
4984   if (bump)
4985     update = bump;
4986
4987   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4988     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4989   else
4990     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4991   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4992                                    dataref_ptr, update);
4993   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4994
4995   /* Copy the points-to information if it exists. */
4996   if (DR_PTR_INFO (dr))
4997     {
4998       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4999       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5000     }
5001
5002   if (!ptr_incr)
5003     return new_dataref_ptr;
5004
5005   /* Update the vector-pointer's cross-iteration increment.  */
5006   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5007     {
5008       tree use = USE_FROM_PTR (use_p);
5009
5010       if (use == dataref_ptr)
5011         SET_USE (use_p, new_dataref_ptr);
5012       else
5013         gcc_assert (operand_equal_p (use, update, 0));
5014     }
5015
5016   return new_dataref_ptr;
5017 }
5018
5019
5020 /* Copy memory reference info such as base/clique from the SRC reference
5021    to the DEST MEM_REF.  */
5022
5023 void
5024 vect_copy_ref_info (tree dest, tree src)
5025 {
5026   if (TREE_CODE (dest) != MEM_REF)
5027     return;
5028
5029   tree src_base = src;
5030   while (handled_component_p (src_base))
5031     src_base = TREE_OPERAND (src_base, 0);
5032   if (TREE_CODE (src_base) != MEM_REF
5033       && TREE_CODE (src_base) != TARGET_MEM_REF)
5034     return;
5035
5036   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5037   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5038 }
5039
5040
5041 /* Function vect_create_destination_var.
5042
5043    Create a new temporary of type VECTYPE.  */
5044
5045 tree
5046 vect_create_destination_var (tree scalar_dest, tree vectype)
5047 {
5048   tree vec_dest;
5049   const char *name;
5050   char *new_name;
5051   tree type;
5052   enum vect_var_kind kind;
5053
5054   kind = vectype
5055     ? VECTOR_BOOLEAN_TYPE_P (vectype)
5056     ? vect_mask_var
5057     : vect_simple_var
5058     : vect_scalar_var;
5059   type = vectype ? vectype : TREE_TYPE (scalar_dest);
5060
5061   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5062
5063   name = get_name (scalar_dest);
5064   if (name)
5065     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5066   else
5067     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5068   vec_dest = vect_get_new_vect_var (type, kind, new_name);
5069   free (new_name);
5070
5071   return vec_dest;
5072 }
5073
5074 /* Function vect_grouped_store_supported.
5075
5076    Returns TRUE if interleave high and interleave low permutations
5077    are supported, and FALSE otherwise.  */
5078
5079 bool
5080 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5081 {
5082   machine_mode mode = TYPE_MODE (vectype);
5083
5084   /* vect_permute_store_chain requires the group size to be equal to 3 or
5085      be a power of two.  */
5086   if (count != 3 && exact_log2 (count) == -1)
5087     {
5088       if (dump_enabled_p ())
5089         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5090                          "the size of the group of accesses"
5091                          " is not a power of 2 or not eqaul to 3\n");
5092       return false;
5093     }
5094
5095   /* Check that the permutation is supported.  */
5096   if (VECTOR_MODE_P (mode))
5097     {
5098       unsigned int i;
5099       if (count == 3)
5100         {
5101           unsigned int j0 = 0, j1 = 0, j2 = 0;
5102           unsigned int i, j;
5103
5104           unsigned int nelt;
5105           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5106             {
5107               if (dump_enabled_p ())
5108                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5109                                  "cannot handle groups of 3 stores for"
5110                                  " variable-length vectors\n");
5111               return false;
5112             }
5113
5114           vec_perm_builder sel (nelt, nelt, 1);
5115           sel.quick_grow (nelt);
5116           vec_perm_indices indices;
5117           for (j = 0; j < 3; j++)
5118             {
5119               int nelt0 = ((3 - j) * nelt) % 3;
5120               int nelt1 = ((3 - j) * nelt + 1) % 3;
5121               int nelt2 = ((3 - j) * nelt + 2) % 3;
5122               for (i = 0; i < nelt; i++)
5123                 {
5124                   if (3 * i + nelt0 < nelt)
5125                     sel[3 * i + nelt0] = j0++;
5126                   if (3 * i + nelt1 < nelt)
5127                     sel[3 * i + nelt1] = nelt + j1++;
5128                   if (3 * i + nelt2 < nelt)
5129                     sel[3 * i + nelt2] = 0;
5130                 }
5131               indices.new_vector (sel, 2, nelt);
5132               if (!can_vec_perm_const_p (mode, indices))
5133                 {
5134                   if (dump_enabled_p ())
5135                     dump_printf (MSG_MISSED_OPTIMIZATION,
5136                                  "permutation op not supported by target.\n");
5137                   return false;
5138                 }
5139
5140               for (i = 0; i < nelt; i++)
5141                 {
5142                   if (3 * i + nelt0 < nelt)
5143                     sel[3 * i + nelt0] = 3 * i + nelt0;
5144                   if (3 * i + nelt1 < nelt)
5145                     sel[3 * i + nelt1] = 3 * i + nelt1;
5146                   if (3 * i + nelt2 < nelt)
5147                     sel[3 * i + nelt2] = nelt + j2++;
5148                 }
5149               indices.new_vector (sel, 2, nelt);
5150               if (!can_vec_perm_const_p (mode, indices))
5151                 {
5152                   if (dump_enabled_p ())
5153                     dump_printf (MSG_MISSED_OPTIMIZATION,
5154                                  "permutation op not supported by target.\n");
5155                   return false;
5156                 }
5157             }
5158           return true;
5159         }
5160       else
5161         {
5162           /* If length is not equal to 3 then only power of 2 is supported.  */
5163           gcc_assert (pow2p_hwi (count));
5164           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5165
5166           /* The encoding has 2 interleaved stepped patterns.  */
5167           vec_perm_builder sel (nelt, 2, 3);
5168           sel.quick_grow (6);
5169           for (i = 0; i < 3; i++)
5170             {
5171               sel[i * 2] = i;
5172               sel[i * 2 + 1] = i + nelt;
5173             }
5174           vec_perm_indices indices (sel, 2, nelt);
5175           if (can_vec_perm_const_p (mode, indices))
5176             {
5177               for (i = 0; i < 6; i++)
5178                 sel[i] += exact_div (nelt, 2);
5179               indices.new_vector (sel, 2, nelt);
5180               if (can_vec_perm_const_p (mode, indices))
5181                 return true;
5182             }
5183         }
5184     }
5185
5186   if (dump_enabled_p ())
5187     dump_printf (MSG_MISSED_OPTIMIZATION,
5188                  "permutaion op not supported by target.\n");
5189   return false;
5190 }
5191
5192
5193 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5194    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5195
5196 bool
5197 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5198                             bool masked_p)
5199 {
5200   if (masked_p)
5201     return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5202                                          vec_mask_store_lanes_optab,
5203                                          vectype, count);
5204   else
5205     return vect_lanes_optab_supported_p ("vec_store_lanes",
5206                                          vec_store_lanes_optab,
5207                                          vectype, count);
5208 }
5209
5210
5211 /* Function vect_permute_store_chain.
5212
5213    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5214    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5215    the data correctly for the stores.  Return the final references for stores
5216    in RESULT_CHAIN.
5217
5218    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5219    The input is 4 vectors each containing 8 elements.  We assign a number to
5220    each element, the input sequence is:
5221
5222    1st vec:   0  1  2  3  4  5  6  7
5223    2nd vec:   8  9 10 11 12 13 14 15
5224    3rd vec:  16 17 18 19 20 21 22 23
5225    4th vec:  24 25 26 27 28 29 30 31
5226
5227    The output sequence should be:
5228
5229    1st vec:  0  8 16 24  1  9 17 25
5230    2nd vec:  2 10 18 26  3 11 19 27
5231    3rd vec:  4 12 20 28  5 13 21 30
5232    4th vec:  6 14 22 30  7 15 23 31
5233
5234    i.e., we interleave the contents of the four vectors in their order.
5235
5236    We use interleave_high/low instructions to create such output.  The input of
5237    each interleave_high/low operation is two vectors:
5238    1st vec    2nd vec
5239    0 1 2 3    4 5 6 7
5240    the even elements of the result vector are obtained left-to-right from the
5241    high/low elements of the first vector.  The odd elements of the result are
5242    obtained left-to-right from the high/low elements of the second vector.
5243    The output of interleave_high will be:   0 4 1 5
5244    and of interleave_low:                   2 6 3 7
5245
5246
5247    The permutation is done in log LENGTH stages.  In each stage interleave_high
5248    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5249    where the first argument is taken from the first half of DR_CHAIN and the
5250    second argument from it's second half.
5251    In our example,
5252
5253    I1: interleave_high (1st vec, 3rd vec)
5254    I2: interleave_low (1st vec, 3rd vec)
5255    I3: interleave_high (2nd vec, 4th vec)
5256    I4: interleave_low (2nd vec, 4th vec)
5257
5258    The output for the first stage is:
5259
5260    I1:  0 16  1 17  2 18  3 19
5261    I2:  4 20  5 21  6 22  7 23
5262    I3:  8 24  9 25 10 26 11 27
5263    I4: 12 28 13 29 14 30 15 31
5264
5265    The output of the second stage, i.e. the final result is:
5266
5267    I1:  0  8 16 24  1  9 17 25
5268    I2:  2 10 18 26  3 11 19 27
5269    I3:  4 12 20 28  5 13 21 30
5270    I4:  6 14 22 30  7 15 23 31.  */
5271
5272 void
5273 vect_permute_store_chain (vec<tree> dr_chain,
5274                           unsigned int length,
5275                           gimple *stmt,
5276                           gimple_stmt_iterator *gsi,
5277                           vec<tree> *result_chain)
5278 {
5279   tree vect1, vect2, high, low;
5280   gimple *perm_stmt;
5281   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5282   tree perm_mask_low, perm_mask_high;
5283   tree data_ref;
5284   tree perm3_mask_low, perm3_mask_high;
5285   unsigned int i, j, n, log_length = exact_log2 (length);
5286
5287   result_chain->quick_grow (length);
5288   memcpy (result_chain->address (), dr_chain.address (),
5289           length * sizeof (tree));
5290
5291   if (length == 3)
5292     {
5293       /* vect_grouped_store_supported ensures that this is constant.  */
5294       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5295       unsigned int j0 = 0, j1 = 0, j2 = 0;
5296
5297       vec_perm_builder sel (nelt, nelt, 1);
5298       sel.quick_grow (nelt);
5299       vec_perm_indices indices;
5300       for (j = 0; j < 3; j++)
5301         {
5302           int nelt0 = ((3 - j) * nelt) % 3;
5303           int nelt1 = ((3 - j) * nelt + 1) % 3;
5304           int nelt2 = ((3 - j) * nelt + 2) % 3;
5305
5306           for (i = 0; i < nelt; i++)
5307             {
5308               if (3 * i + nelt0 < nelt)
5309                 sel[3 * i + nelt0] = j0++;
5310               if (3 * i + nelt1 < nelt)
5311                 sel[3 * i + nelt1] = nelt + j1++;
5312               if (3 * i + nelt2 < nelt)
5313                 sel[3 * i + nelt2] = 0;
5314             }
5315           indices.new_vector (sel, 2, nelt);
5316           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5317
5318           for (i = 0; i < nelt; i++)
5319             {
5320               if (3 * i + nelt0 < nelt)
5321                 sel[3 * i + nelt0] = 3 * i + nelt0;
5322               if (3 * i + nelt1 < nelt)
5323                 sel[3 * i + nelt1] = 3 * i + nelt1;
5324               if (3 * i + nelt2 < nelt)
5325                 sel[3 * i + nelt2] = nelt + j2++;
5326             }
5327           indices.new_vector (sel, 2, nelt);
5328           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5329
5330           vect1 = dr_chain[0];
5331           vect2 = dr_chain[1];
5332
5333           /* Create interleaving stmt:
5334              low = VEC_PERM_EXPR <vect1, vect2,
5335                                   {j, nelt, *, j + 1, nelt + j + 1, *,
5336                                    j + 2, nelt + j + 2, *, ...}>  */
5337           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5338           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5339                                            vect2, perm3_mask_low);
5340           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5341
5342           vect1 = data_ref;
5343           vect2 = dr_chain[2];
5344           /* Create interleaving stmt:
5345              low = VEC_PERM_EXPR <vect1, vect2,
5346                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
5347                                    6, 7, nelt + j + 2, ...}>  */
5348           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5349           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5350                                            vect2, perm3_mask_high);
5351           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5352           (*result_chain)[j] = data_ref;
5353         }
5354     }
5355   else
5356     {
5357       /* If length is not equal to 3 then only power of 2 is supported.  */
5358       gcc_assert (pow2p_hwi (length));
5359
5360       /* The encoding has 2 interleaved stepped patterns.  */
5361       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5362       vec_perm_builder sel (nelt, 2, 3);
5363       sel.quick_grow (6);
5364       for (i = 0; i < 3; i++)
5365         {
5366           sel[i * 2] = i;
5367           sel[i * 2 + 1] = i + nelt;
5368         }
5369         vec_perm_indices indices (sel, 2, nelt);
5370         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5371
5372         for (i = 0; i < 6; i++)
5373           sel[i] += exact_div (nelt, 2);
5374         indices.new_vector (sel, 2, nelt);
5375         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5376
5377         for (i = 0, n = log_length; i < n; i++)
5378           {
5379             for (j = 0; j < length/2; j++)
5380               {
5381                 vect1 = dr_chain[j];
5382                 vect2 = dr_chain[j+length/2];
5383
5384                 /* Create interleaving stmt:
5385                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5386                                                         ...}>  */
5387                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5388                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5389                                                  vect2, perm_mask_high);
5390                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5391                 (*result_chain)[2*j] = high;
5392
5393                 /* Create interleaving stmt:
5394                    low = VEC_PERM_EXPR <vect1, vect2,
5395                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5396                                          ...}>  */
5397                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5398                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5399                                                  vect2, perm_mask_low);
5400                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5401                 (*result_chain)[2*j+1] = low;
5402               }
5403             memcpy (dr_chain.address (), result_chain->address (),
5404                     length * sizeof (tree));
5405           }
5406     }
5407 }
5408
5409 /* Function vect_setup_realignment
5410
5411    This function is called when vectorizing an unaligned load using
5412    the dr_explicit_realign[_optimized] scheme.
5413    This function generates the following code at the loop prolog:
5414
5415       p = initial_addr;
5416    x  msq_init = *(floor(p));   # prolog load
5417       realignment_token = call target_builtin;
5418     loop:
5419    x  msq = phi (msq_init, ---)
5420
5421    The stmts marked with x are generated only for the case of
5422    dr_explicit_realign_optimized.
5423
5424    The code above sets up a new (vector) pointer, pointing to the first
5425    location accessed by STMT, and a "floor-aligned" load using that pointer.
5426    It also generates code to compute the "realignment-token" (if the relevant
5427    target hook was defined), and creates a phi-node at the loop-header bb
5428    whose arguments are the result of the prolog-load (created by this
5429    function) and the result of a load that takes place in the loop (to be
5430    created by the caller to this function).
5431
5432    For the case of dr_explicit_realign_optimized:
5433    The caller to this function uses the phi-result (msq) to create the
5434    realignment code inside the loop, and sets up the missing phi argument,
5435    as follows:
5436     loop:
5437       msq = phi (msq_init, lsq)
5438       lsq = *(floor(p'));        # load in loop
5439       result = realign_load (msq, lsq, realignment_token);
5440
5441    For the case of dr_explicit_realign:
5442     loop:
5443       msq = *(floor(p));        # load in loop
5444       p' = p + (VS-1);
5445       lsq = *(floor(p'));       # load in loop
5446       result = realign_load (msq, lsq, realignment_token);
5447
5448    Input:
5449    STMT - (scalar) load stmt to be vectorized. This load accesses
5450           a memory location that may be unaligned.
5451    BSI - place where new code is to be inserted.
5452    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5453                               is used.
5454
5455    Output:
5456    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5457                        target hook, if defined.
5458    Return value - the result of the loop-header phi node.  */
5459
5460 tree
5461 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5462                         tree *realignment_token,
5463                         enum dr_alignment_support alignment_support_scheme,
5464                         tree init_addr,
5465                         struct loop **at_loop)
5466 {
5467   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5468   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5469   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5470   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5471   struct loop *loop = NULL;
5472   edge pe = NULL;
5473   tree scalar_dest = gimple_assign_lhs (stmt);
5474   tree vec_dest;
5475   gimple *inc;
5476   tree ptr;
5477   tree data_ref;
5478   basic_block new_bb;
5479   tree msq_init = NULL_TREE;
5480   tree new_temp;
5481   gphi *phi_stmt;
5482   tree msq = NULL_TREE;
5483   gimple_seq stmts = NULL;
5484   bool inv_p;
5485   bool compute_in_loop = false;
5486   bool nested_in_vect_loop = false;
5487   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5488   struct loop *loop_for_initial_load = NULL;
5489
5490   if (loop_vinfo)
5491     {
5492       loop = LOOP_VINFO_LOOP (loop_vinfo);
5493       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5494     }
5495
5496   gcc_assert (alignment_support_scheme == dr_explicit_realign
5497               || alignment_support_scheme == dr_explicit_realign_optimized);
5498
5499   /* We need to generate three things:
5500      1. the misalignment computation
5501      2. the extra vector load (for the optimized realignment scheme).
5502      3. the phi node for the two vectors from which the realignment is
5503       done (for the optimized realignment scheme).  */
5504
5505   /* 1. Determine where to generate the misalignment computation.
5506
5507      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5508      calculation will be generated by this function, outside the loop (in the
5509      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5510      caller, inside the loop.
5511
5512      Background: If the misalignment remains fixed throughout the iterations of
5513      the loop, then both realignment schemes are applicable, and also the
5514      misalignment computation can be done outside LOOP.  This is because we are
5515      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5516      are a multiple of VS (the Vector Size), and therefore the misalignment in
5517      different vectorized LOOP iterations is always the same.
5518      The problem arises only if the memory access is in an inner-loop nested
5519      inside LOOP, which is now being vectorized using outer-loop vectorization.
5520      This is the only case when the misalignment of the memory access may not
5521      remain fixed throughout the iterations of the inner-loop (as explained in
5522      detail in vect_supportable_dr_alignment).  In this case, not only is the
5523      optimized realignment scheme not applicable, but also the misalignment
5524      computation (and generation of the realignment token that is passed to
5525      REALIGN_LOAD) have to be done inside the loop.
5526
5527      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5528      or not, which in turn determines if the misalignment is computed inside
5529      the inner-loop, or outside LOOP.  */
5530
5531   if (init_addr != NULL_TREE || !loop_vinfo)
5532     {
5533       compute_in_loop = true;
5534       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5535     }
5536
5537
5538   /* 2. Determine where to generate the extra vector load.
5539
5540      For the optimized realignment scheme, instead of generating two vector
5541      loads in each iteration, we generate a single extra vector load in the
5542      preheader of the loop, and in each iteration reuse the result of the
5543      vector load from the previous iteration.  In case the memory access is in
5544      an inner-loop nested inside LOOP, which is now being vectorized using
5545      outer-loop vectorization, we need to determine whether this initial vector
5546      load should be generated at the preheader of the inner-loop, or can be
5547      generated at the preheader of LOOP.  If the memory access has no evolution
5548      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5549      to be generated inside LOOP (in the preheader of the inner-loop).  */
5550
5551   if (nested_in_vect_loop)
5552     {
5553       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5554       bool invariant_in_outerloop =
5555             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5556       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5557     }
5558   else
5559     loop_for_initial_load = loop;
5560   if (at_loop)
5561     *at_loop = loop_for_initial_load;
5562
5563   if (loop_for_initial_load)
5564     pe = loop_preheader_edge (loop_for_initial_load);
5565
5566   /* 3. For the case of the optimized realignment, create the first vector
5567       load at the loop preheader.  */
5568
5569   if (alignment_support_scheme == dr_explicit_realign_optimized)
5570     {
5571       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5572       gassign *new_stmt;
5573
5574       gcc_assert (!compute_in_loop);
5575       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5576       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5577                                       NULL_TREE, &init_addr, NULL, &inc,
5578                                       true, &inv_p);
5579       if (TREE_CODE (ptr) == SSA_NAME)
5580         new_temp = copy_ssa_name (ptr);
5581       else
5582         new_temp = make_ssa_name (TREE_TYPE (ptr));
5583       unsigned int align = DR_TARGET_ALIGNMENT (dr);
5584       new_stmt = gimple_build_assign
5585                    (new_temp, BIT_AND_EXPR, ptr,
5586                     build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5587       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5588       gcc_assert (!new_bb);
5589       data_ref
5590         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5591                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5592       vect_copy_ref_info (data_ref, DR_REF (dr));
5593       new_stmt = gimple_build_assign (vec_dest, data_ref);
5594       new_temp = make_ssa_name (vec_dest, new_stmt);
5595       gimple_assign_set_lhs (new_stmt, new_temp);
5596       if (pe)
5597         {
5598           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5599           gcc_assert (!new_bb);
5600         }
5601       else
5602          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5603
5604       msq_init = gimple_assign_lhs (new_stmt);
5605     }
5606
5607   /* 4. Create realignment token using a target builtin, if available.
5608       It is done either inside the containing loop, or before LOOP (as
5609       determined above).  */
5610
5611   if (targetm.vectorize.builtin_mask_for_load)
5612     {
5613       gcall *new_stmt;
5614       tree builtin_decl;
5615
5616       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
5617       if (!init_addr)
5618         {
5619           /* Generate the INIT_ADDR computation outside LOOP.  */
5620           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5621                                                             NULL_TREE);
5622           if (loop)
5623             {
5624               pe = loop_preheader_edge (loop);
5625               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5626               gcc_assert (!new_bb);
5627             }
5628           else
5629              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5630         }
5631
5632       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5633       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5634       vec_dest =
5635         vect_create_destination_var (scalar_dest,
5636                                      gimple_call_return_type (new_stmt));
5637       new_temp = make_ssa_name (vec_dest, new_stmt);
5638       gimple_call_set_lhs (new_stmt, new_temp);
5639
5640       if (compute_in_loop)
5641         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5642       else
5643         {
5644           /* Generate the misalignment computation outside LOOP.  */
5645           pe = loop_preheader_edge (loop);
5646           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5647           gcc_assert (!new_bb);
5648         }
5649
5650       *realignment_token = gimple_call_lhs (new_stmt);
5651
5652       /* The result of the CALL_EXPR to this builtin is determined from
5653          the value of the parameter and no global variables are touched
5654          which makes the builtin a "const" function.  Requiring the
5655          builtin to have the "const" attribute makes it unnecessary
5656          to call mark_call_clobbered.  */
5657       gcc_assert (TREE_READONLY (builtin_decl));
5658     }
5659
5660   if (alignment_support_scheme == dr_explicit_realign)
5661     return msq;
5662
5663   gcc_assert (!compute_in_loop);
5664   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5665
5666
5667   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5668
5669   pe = loop_preheader_edge (containing_loop);
5670   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5671   msq = make_ssa_name (vec_dest);
5672   phi_stmt = create_phi_node (msq, containing_loop->header);
5673   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5674
5675   return msq;
5676 }
5677
5678
5679 /* Function vect_grouped_load_supported.
5680
5681    COUNT is the size of the load group (the number of statements plus the
5682    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5683    only one statement, with a gap of COUNT - 1.
5684
5685    Returns true if a suitable permute exists.  */
5686
5687 bool
5688 vect_grouped_load_supported (tree vectype, bool single_element_p,
5689                              unsigned HOST_WIDE_INT count)
5690 {
5691   machine_mode mode = TYPE_MODE (vectype);
5692
5693   /* If this is single-element interleaving with an element distance
5694      that leaves unused vector loads around punt - we at least create
5695      very sub-optimal code in that case (and blow up memory,
5696      see PR65518).  */
5697   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5698     {
5699       if (dump_enabled_p ())
5700         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5701                          "single-element interleaving not supported "
5702                          "for not adjacent vector loads\n");
5703       return false;
5704     }
5705
5706   /* vect_permute_load_chain requires the group size to be equal to 3 or
5707      be a power of two.  */
5708   if (count != 3 && exact_log2 (count) == -1)
5709     {
5710       if (dump_enabled_p ())
5711         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5712                          "the size of the group of accesses"
5713                          " is not a power of 2 or not equal to 3\n");
5714       return false;
5715     }
5716
5717   /* Check that the permutation is supported.  */
5718   if (VECTOR_MODE_P (mode))
5719     {
5720       unsigned int i, j;
5721       if (count == 3)
5722         {
5723           unsigned int nelt;
5724           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5725             {
5726               if (dump_enabled_p ())
5727                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5728                                  "cannot handle groups of 3 loads for"
5729                                  " variable-length vectors\n");
5730               return false;
5731             }
5732
5733           vec_perm_builder sel (nelt, nelt, 1);
5734           sel.quick_grow (nelt);
5735           vec_perm_indices indices;
5736           unsigned int k;
5737           for (k = 0; k < 3; k++)
5738             {
5739               for (i = 0; i < nelt; i++)
5740                 if (3 * i + k < 2 * nelt)
5741                   sel[i] = 3 * i + k;
5742                 else
5743                   sel[i] = 0;
5744               indices.new_vector (sel, 2, nelt);
5745               if (!can_vec_perm_const_p (mode, indices))
5746                 {
5747                   if (dump_enabled_p ())
5748                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5749                                      "shuffle of 3 loads is not supported by"
5750                                      " target\n");
5751                   return false;
5752                 }
5753               for (i = 0, j = 0; i < nelt; i++)
5754                 if (3 * i + k < 2 * nelt)
5755                   sel[i] = i;
5756                 else
5757                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5758               indices.new_vector (sel, 2, nelt);
5759               if (!can_vec_perm_const_p (mode, indices))
5760                 {
5761                   if (dump_enabled_p ())
5762                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5763                                      "shuffle of 3 loads is not supported by"
5764                                      " target\n");
5765                   return false;
5766                 }
5767             }
5768           return true;
5769         }
5770       else
5771         {
5772           /* If length is not equal to 3 then only power of 2 is supported.  */
5773           gcc_assert (pow2p_hwi (count));
5774           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5775
5776           /* The encoding has a single stepped pattern.  */
5777           vec_perm_builder sel (nelt, 1, 3);
5778           sel.quick_grow (3);
5779           for (i = 0; i < 3; i++)
5780             sel[i] = i * 2;
5781           vec_perm_indices indices (sel, 2, nelt);
5782           if (can_vec_perm_const_p (mode, indices))
5783             {
5784               for (i = 0; i < 3; i++)
5785                 sel[i] = i * 2 + 1;
5786               indices.new_vector (sel, 2, nelt);
5787               if (can_vec_perm_const_p (mode, indices))
5788                 return true;
5789             }
5790         }
5791     }
5792
5793   if (dump_enabled_p ())
5794     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5795                      "extract even/odd not supported by target\n");
5796   return false;
5797 }
5798
5799 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5800    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5801
5802 bool
5803 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5804                            bool masked_p)
5805 {
5806   if (masked_p)
5807     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5808                                          vec_mask_load_lanes_optab,
5809                                          vectype, count);
5810   else
5811     return vect_lanes_optab_supported_p ("vec_load_lanes",
5812                                          vec_load_lanes_optab,
5813                                          vectype, count);
5814 }
5815
5816 /* Function vect_permute_load_chain.
5817
5818    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5819    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5820    the input data correctly.  Return the final references for loads in
5821    RESULT_CHAIN.
5822
5823    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5824    The input is 4 vectors each containing 8 elements. We assign a number to each
5825    element, the input sequence is:
5826
5827    1st vec:   0  1  2  3  4  5  6  7
5828    2nd vec:   8  9 10 11 12 13 14 15
5829    3rd vec:  16 17 18 19 20 21 22 23
5830    4th vec:  24 25 26 27 28 29 30 31
5831
5832    The output sequence should be:
5833
5834    1st vec:  0 4  8 12 16 20 24 28
5835    2nd vec:  1 5  9 13 17 21 25 29
5836    3rd vec:  2 6 10 14 18 22 26 30
5837    4th vec:  3 7 11 15 19 23 27 31
5838
5839    i.e., the first output vector should contain the first elements of each
5840    interleaving group, etc.
5841
5842    We use extract_even/odd instructions to create such output.  The input of
5843    each extract_even/odd operation is two vectors
5844    1st vec    2nd vec
5845    0 1 2 3    4 5 6 7
5846
5847    and the output is the vector of extracted even/odd elements.  The output of
5848    extract_even will be:   0 2 4 6
5849    and of extract_odd:     1 3 5 7
5850
5851
5852    The permutation is done in log LENGTH stages.  In each stage extract_even
5853    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5854    their order.  In our example,
5855
5856    E1: extract_even (1st vec, 2nd vec)
5857    E2: extract_odd (1st vec, 2nd vec)
5858    E3: extract_even (3rd vec, 4th vec)
5859    E4: extract_odd (3rd vec, 4th vec)
5860
5861    The output for the first stage will be:
5862
5863    E1:  0  2  4  6  8 10 12 14
5864    E2:  1  3  5  7  9 11 13 15
5865    E3: 16 18 20 22 24 26 28 30
5866    E4: 17 19 21 23 25 27 29 31
5867
5868    In order to proceed and create the correct sequence for the next stage (or
5869    for the correct output, if the second stage is the last one, as in our
5870    example), we first put the output of extract_even operation and then the
5871    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5872    The input for the second stage is:
5873
5874    1st vec (E1):  0  2  4  6  8 10 12 14
5875    2nd vec (E3): 16 18 20 22 24 26 28 30
5876    3rd vec (E2):  1  3  5  7  9 11 13 15
5877    4th vec (E4): 17 19 21 23 25 27 29 31
5878
5879    The output of the second stage:
5880
5881    E1: 0 4  8 12 16 20 24 28
5882    E2: 2 6 10 14 18 22 26 30
5883    E3: 1 5  9 13 17 21 25 29
5884    E4: 3 7 11 15 19 23 27 31
5885
5886    And RESULT_CHAIN after reordering:
5887
5888    1st vec (E1):  0 4  8 12 16 20 24 28
5889    2nd vec (E3):  1 5  9 13 17 21 25 29
5890    3rd vec (E2):  2 6 10 14 18 22 26 30
5891    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5892
5893 static void
5894 vect_permute_load_chain (vec<tree> dr_chain,
5895                          unsigned int length,
5896                          gimple *stmt,
5897                          gimple_stmt_iterator *gsi,
5898                          vec<tree> *result_chain)
5899 {
5900   tree data_ref, first_vect, second_vect;
5901   tree perm_mask_even, perm_mask_odd;
5902   tree perm3_mask_low, perm3_mask_high;
5903   gimple *perm_stmt;
5904   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5905   unsigned int i, j, log_length = exact_log2 (length);
5906
5907   result_chain->quick_grow (length);
5908   memcpy (result_chain->address (), dr_chain.address (),
5909           length * sizeof (tree));
5910
5911   if (length == 3)
5912     {
5913       /* vect_grouped_load_supported ensures that this is constant.  */
5914       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5915       unsigned int k;
5916
5917       vec_perm_builder sel (nelt, nelt, 1);
5918       sel.quick_grow (nelt);
5919       vec_perm_indices indices;
5920       for (k = 0; k < 3; k++)
5921         {
5922           for (i = 0; i < nelt; i++)
5923             if (3 * i + k < 2 * nelt)
5924               sel[i] = 3 * i + k;
5925             else
5926               sel[i] = 0;
5927           indices.new_vector (sel, 2, nelt);
5928           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5929
5930           for (i = 0, j = 0; i < nelt; i++)
5931             if (3 * i + k < 2 * nelt)
5932               sel[i] = i;
5933             else
5934               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5935           indices.new_vector (sel, 2, nelt);
5936           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5937
5938           first_vect = dr_chain[0];
5939           second_vect = dr_chain[1];
5940
5941           /* Create interleaving stmt (low part of):
5942              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5943                                                              ...}>  */
5944           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5945           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5946                                            second_vect, perm3_mask_low);
5947           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5948
5949           /* Create interleaving stmt (high part of):
5950              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5951                                                               ...}>  */
5952           first_vect = data_ref;
5953           second_vect = dr_chain[2];
5954           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5955           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5956                                            second_vect, perm3_mask_high);
5957           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5958           (*result_chain)[k] = data_ref;
5959         }
5960     }
5961   else
5962     {
5963       /* If length is not equal to 3 then only power of 2 is supported.  */
5964       gcc_assert (pow2p_hwi (length));
5965
5966       /* The encoding has a single stepped pattern.  */
5967       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5968       vec_perm_builder sel (nelt, 1, 3);
5969       sel.quick_grow (3);
5970       for (i = 0; i < 3; ++i)
5971         sel[i] = i * 2;
5972       vec_perm_indices indices (sel, 2, nelt);
5973       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5974
5975       for (i = 0; i < 3; ++i)
5976         sel[i] = i * 2 + 1;
5977       indices.new_vector (sel, 2, nelt);
5978       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5979
5980       for (i = 0; i < log_length; i++)
5981         {
5982           for (j = 0; j < length; j += 2)
5983             {
5984               first_vect = dr_chain[j];
5985               second_vect = dr_chain[j+1];
5986
5987               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5988               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5989               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5990                                                first_vect, second_vect,
5991                                                perm_mask_even);
5992               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5993               (*result_chain)[j/2] = data_ref;
5994
5995               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5996               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5997               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5998                                                first_vect, second_vect,
5999                                                perm_mask_odd);
6000               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6001               (*result_chain)[j/2+length/2] = data_ref;
6002             }
6003           memcpy (dr_chain.address (), result_chain->address (),
6004                   length * sizeof (tree));
6005         }
6006     }
6007 }
6008
6009 /* Function vect_shift_permute_load_chain.
6010
6011    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6012    sequence of stmts to reorder the input data accordingly.
6013    Return the final references for loads in RESULT_CHAIN.
6014    Return true if successed, false otherwise.
6015
6016    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6017    The input is 3 vectors each containing 8 elements.  We assign a
6018    number to each element, the input sequence is:
6019
6020    1st vec:   0  1  2  3  4  5  6  7
6021    2nd vec:   8  9 10 11 12 13 14 15
6022    3rd vec:  16 17 18 19 20 21 22 23
6023
6024    The output sequence should be:
6025
6026    1st vec:  0 3 6  9 12 15 18 21
6027    2nd vec:  1 4 7 10 13 16 19 22
6028    3rd vec:  2 5 8 11 14 17 20 23
6029
6030    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6031
6032    First we shuffle all 3 vectors to get correct elements order:
6033
6034    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6035    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6036    3rd vec:  (16 19 22) (17 20 23) (18 21)
6037
6038    Next we unite and shift vector 3 times:
6039
6040    1st step:
6041      shift right by 6 the concatenation of:
6042      "1st vec" and  "2nd vec"
6043        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6044      "2nd vec" and  "3rd vec"
6045        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6046      "3rd vec" and  "1st vec"
6047        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6048                              | New vectors                   |
6049
6050      So that now new vectors are:
6051
6052      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6053      2nd vec:  (10 13) (16 19 22) (17 20 23)
6054      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6055
6056    2nd step:
6057      shift right by 5 the concatenation of:
6058      "1st vec" and  "3rd vec"
6059        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6060      "2nd vec" and  "1st vec"
6061        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6062      "3rd vec" and  "2nd vec"
6063        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6064                           | New vectors                   |
6065
6066      So that now new vectors are:
6067
6068      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6069      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6070      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6071
6072    3rd step:
6073      shift right by 5 the concatenation of:
6074      "1st vec" and  "1st vec"
6075        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6076      shift right by 3 the concatenation of:
6077      "2nd vec" and  "2nd vec"
6078                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6079                           | New vectors                   |
6080
6081      So that now all vectors are READY:
6082      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6083      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6084      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6085
6086    This algorithm is faster than one in vect_permute_load_chain if:
6087      1.  "shift of a concatination" is faster than general permutation.
6088          This is usually so.
6089      2.  The TARGET machine can't execute vector instructions in parallel.
6090          This is because each step of the algorithm depends on previous.
6091          The algorithm in vect_permute_load_chain is much more parallel.
6092
6093    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6094 */
6095
6096 static bool
6097 vect_shift_permute_load_chain (vec<tree> dr_chain,
6098                                unsigned int length,
6099                                gimple *stmt,
6100                                gimple_stmt_iterator *gsi,
6101                                vec<tree> *result_chain)
6102 {
6103   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6104   tree perm2_mask1, perm2_mask2, perm3_mask;
6105   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6106   gimple *perm_stmt;
6107
6108   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6109   unsigned int i;
6110   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6111   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6112
6113   unsigned HOST_WIDE_INT nelt, vf;
6114   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6115       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6116     /* Not supported for variable-length vectors.  */
6117     return false;
6118
6119   vec_perm_builder sel (nelt, nelt, 1);
6120   sel.quick_grow (nelt);
6121
6122   result_chain->quick_grow (length);
6123   memcpy (result_chain->address (), dr_chain.address (),
6124           length * sizeof (tree));
6125
6126   if (pow2p_hwi (length) && vf > 4)
6127     {
6128       unsigned int j, log_length = exact_log2 (length);
6129       for (i = 0; i < nelt / 2; ++i)
6130         sel[i] = i * 2;
6131       for (i = 0; i < nelt / 2; ++i)
6132         sel[nelt / 2 + i] = i * 2 + 1;
6133       vec_perm_indices indices (sel, 2, nelt);
6134       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6135         {
6136           if (dump_enabled_p ())
6137             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6138                              "shuffle of 2 fields structure is not \
6139                               supported by target\n");
6140           return false;
6141         }
6142       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6143
6144       for (i = 0; i < nelt / 2; ++i)
6145         sel[i] = i * 2 + 1;
6146       for (i = 0; i < nelt / 2; ++i)
6147         sel[nelt / 2 + i] = i * 2;
6148       indices.new_vector (sel, 2, nelt);
6149       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6150         {
6151           if (dump_enabled_p ())
6152             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6153                              "shuffle of 2 fields structure is not \
6154                               supported by target\n");
6155           return false;
6156         }
6157       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6158
6159       /* Generating permutation constant to shift all elements.
6160          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6161       for (i = 0; i < nelt; i++)
6162         sel[i] = nelt / 2 + i;
6163       indices.new_vector (sel, 2, nelt);
6164       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6165         {
6166           if (dump_enabled_p ())
6167             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6168                              "shift permutation is not supported by target\n");
6169           return false;
6170         }
6171       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6172
6173       /* Generating permutation constant to select vector from 2.
6174          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6175       for (i = 0; i < nelt / 2; i++)
6176         sel[i] = i;
6177       for (i = nelt / 2; i < nelt; i++)
6178         sel[i] = nelt + i;
6179       indices.new_vector (sel, 2, nelt);
6180       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6181         {
6182           if (dump_enabled_p ())
6183             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6184                              "select is not supported by target\n");
6185           return false;
6186         }
6187       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6188
6189       for (i = 0; i < log_length; i++)
6190         {
6191           for (j = 0; j < length; j += 2)
6192             {
6193               first_vect = dr_chain[j];
6194               second_vect = dr_chain[j + 1];
6195
6196               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6197               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6198                                                first_vect, first_vect,
6199                                                perm2_mask1);
6200               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6201               vect[0] = data_ref;
6202
6203               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6204               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6205                                                second_vect, second_vect,
6206                                                perm2_mask2);
6207               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6208               vect[1] = data_ref;
6209
6210               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6211               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6212                                                vect[0], vect[1], shift1_mask);
6213               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6214               (*result_chain)[j/2 + length/2] = data_ref;
6215
6216               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6217               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6218                                                vect[0], vect[1], select_mask);
6219               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6220               (*result_chain)[j/2] = data_ref;
6221             }
6222           memcpy (dr_chain.address (), result_chain->address (),
6223                   length * sizeof (tree));
6224         }
6225       return true;
6226     }
6227   if (length == 3 && vf > 2)
6228     {
6229       unsigned int k = 0, l = 0;
6230
6231       /* Generating permutation constant to get all elements in rigth order.
6232          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6233       for (i = 0; i < nelt; i++)
6234         {
6235           if (3 * k + (l % 3) >= nelt)
6236             {
6237               k = 0;
6238               l += (3 - (nelt % 3));
6239             }
6240           sel[i] = 3 * k + (l % 3);
6241           k++;
6242         }
6243       vec_perm_indices indices (sel, 2, nelt);
6244       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6245         {
6246           if (dump_enabled_p ())
6247             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6248                              "shuffle of 3 fields structure is not \
6249                               supported by target\n");
6250           return false;
6251         }
6252       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6253
6254       /* Generating permutation constant to shift all elements.
6255          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6256       for (i = 0; i < nelt; i++)
6257         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6258       indices.new_vector (sel, 2, nelt);
6259       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6260         {
6261           if (dump_enabled_p ())
6262             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6263                              "shift permutation is not supported by target\n");
6264           return false;
6265         }
6266       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6267
6268       /* Generating permutation constant to shift all elements.
6269          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6270       for (i = 0; i < nelt; i++)
6271         sel[i] = 2 * (nelt / 3) + 1 + i;
6272       indices.new_vector (sel, 2, nelt);
6273       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6274         {
6275           if (dump_enabled_p ())
6276             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6277                              "shift permutation is not supported by target\n");
6278           return false;
6279         }
6280       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6281
6282       /* Generating permutation constant to shift all elements.
6283          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6284       for (i = 0; i < nelt; i++)
6285         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6286       indices.new_vector (sel, 2, nelt);
6287       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6288         {
6289           if (dump_enabled_p ())
6290             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6291                              "shift permutation is not supported by target\n");
6292           return false;
6293         }
6294       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6295
6296       /* Generating permutation constant to shift all elements.
6297          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6298       for (i = 0; i < nelt; i++)
6299         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6300       indices.new_vector (sel, 2, nelt);
6301       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6302         {
6303           if (dump_enabled_p ())
6304             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6305                              "shift permutation is not supported by target\n");
6306           return false;
6307         }
6308       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6309
6310       for (k = 0; k < 3; k++)
6311         {
6312           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6313           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6314                                            dr_chain[k], dr_chain[k],
6315                                            perm3_mask);
6316           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6317           vect[k] = data_ref;
6318         }
6319
6320       for (k = 0; k < 3; k++)
6321         {
6322           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6323           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6324                                            vect[k % 3], vect[(k + 1) % 3],
6325                                            shift1_mask);
6326           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6327           vect_shift[k] = data_ref;
6328         }
6329
6330       for (k = 0; k < 3; k++)
6331         {
6332           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6333           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6334                                            vect_shift[(4 - k) % 3],
6335                                            vect_shift[(3 - k) % 3],
6336                                            shift2_mask);
6337           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6338           vect[k] = data_ref;
6339         }
6340
6341       (*result_chain)[3 - (nelt % 3)] = vect[2];
6342
6343       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6344       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6345                                        vect[0], shift3_mask);
6346       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6347       (*result_chain)[nelt % 3] = data_ref;
6348
6349       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6350       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6351                                        vect[1], shift4_mask);
6352       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6353       (*result_chain)[0] = data_ref;
6354       return true;
6355     }
6356   return false;
6357 }
6358
6359 /* Function vect_transform_grouped_load.
6360
6361    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6362    to perform their permutation and ascribe the result vectorized statements to
6363    the scalar statements.
6364 */
6365
6366 void
6367 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6368                              gimple_stmt_iterator *gsi)
6369 {
6370   machine_mode mode;
6371   vec<tree> result_chain = vNULL;
6372
6373   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6374      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6375      vectors, that are ready for vector computation.  */
6376   result_chain.create (size);
6377
6378   /* If reassociation width for vector type is 2 or greater target machine can
6379      execute 2 or more vector instructions in parallel.  Otherwise try to
6380      get chain for loads group using vect_shift_permute_load_chain.  */
6381   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6382   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6383       || pow2p_hwi (size)
6384       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6385                                          gsi, &result_chain))
6386     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6387   vect_record_grouped_load_vectors (stmt, result_chain);
6388   result_chain.release ();
6389 }
6390
6391 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6392    generated as part of the vectorization of STMT.  Assign the statement
6393    for each vector to the associated scalar statement.  */
6394
6395 void
6396 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6397 {
6398   gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6399   gimple *next_stmt, *new_stmt;
6400   unsigned int i, gap_count;
6401   tree tmp_data_ref;
6402
6403   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6404      Since we scan the chain starting from it's first node, their order
6405      corresponds the order of data-refs in RESULT_CHAIN.  */
6406   next_stmt = first_stmt;
6407   gap_count = 1;
6408   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6409     {
6410       if (!next_stmt)
6411         break;
6412
6413       /* Skip the gaps.  Loads created for the gaps will be removed by dead
6414        code elimination pass later.  No need to check for the first stmt in
6415        the group, since it always exists.
6416        GROUP_GAP is the number of steps in elements from the previous
6417        access (if there is no gap GROUP_GAP is 1).  We skip loads that
6418        correspond to the gaps.  */
6419       if (next_stmt != first_stmt
6420           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
6421       {
6422         gap_count++;
6423         continue;
6424       }
6425
6426       while (next_stmt)
6427         {
6428           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6429           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6430              copies, and we put the new vector statement in the first available
6431              RELATED_STMT.  */
6432           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6433             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6434           else
6435             {
6436               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6437                 {
6438                   gimple *prev_stmt =
6439                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6440                   gimple *rel_stmt =
6441                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6442                   while (rel_stmt)
6443                     {
6444                       prev_stmt = rel_stmt;
6445                       rel_stmt =
6446                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6447                     }
6448
6449                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6450                     new_stmt;
6451                 }
6452             }
6453
6454           next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6455           gap_count = 1;
6456           /* If NEXT_STMT accesses the same DR as the previous statement,
6457              put the same TMP_DATA_REF as its vectorized statement; otherwise
6458              get the next data-ref from RESULT_CHAIN.  */
6459           if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6460             break;
6461         }
6462     }
6463 }
6464
6465 /* Function vect_force_dr_alignment_p.
6466
6467    Returns whether the alignment of a DECL can be forced to be aligned
6468    on ALIGNMENT bit boundary.  */
6469
6470 bool
6471 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6472 {
6473   if (!VAR_P (decl))
6474     return false;
6475
6476   if (decl_in_symtab_p (decl)
6477       && !symtab_node::get (decl)->can_increase_alignment_p ())
6478     return false;
6479
6480   if (TREE_STATIC (decl))
6481     return (alignment <= MAX_OFILE_ALIGNMENT);
6482   else
6483     return (alignment <= MAX_STACK_ALIGNMENT);
6484 }
6485
6486
6487 /* Return whether the data reference DR is supported with respect to its
6488    alignment.
6489    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6490    it is aligned, i.e., check if it is possible to vectorize it with different
6491    alignment.  */
6492
6493 enum dr_alignment_support
6494 vect_supportable_dr_alignment (struct data_reference *dr,
6495                                bool check_aligned_accesses)
6496 {
6497   gimple *stmt = DR_STMT (dr);
6498   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6499   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6500   machine_mode mode = TYPE_MODE (vectype);
6501   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6502   struct loop *vect_loop = NULL;
6503   bool nested_in_vect_loop = false;
6504
6505   if (aligned_access_p (dr) && !check_aligned_accesses)
6506     return dr_aligned;
6507
6508   /* For now assume all conditional loads/stores support unaligned
6509      access without any special code.  */
6510   if (is_gimple_call (stmt)
6511       && gimple_call_internal_p (stmt)
6512       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6513           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6514     return dr_unaligned_supported;
6515
6516   if (loop_vinfo)
6517     {
6518       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6519       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6520     }
6521
6522   /* Possibly unaligned access.  */
6523
6524   /* We can choose between using the implicit realignment scheme (generating
6525      a misaligned_move stmt) and the explicit realignment scheme (generating
6526      aligned loads with a REALIGN_LOAD).  There are two variants to the
6527      explicit realignment scheme: optimized, and unoptimized.
6528      We can optimize the realignment only if the step between consecutive
6529      vector loads is equal to the vector size.  Since the vector memory
6530      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6531      is guaranteed that the misalignment amount remains the same throughout the
6532      execution of the vectorized loop.  Therefore, we can create the
6533      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6534      at the loop preheader.
6535
6536      However, in the case of outer-loop vectorization, when vectorizing a
6537      memory access in the inner-loop nested within the LOOP that is now being
6538      vectorized, while it is guaranteed that the misalignment of the
6539      vectorized memory access will remain the same in different outer-loop
6540      iterations, it is *not* guaranteed that is will remain the same throughout
6541      the execution of the inner-loop.  This is because the inner-loop advances
6542      with the original scalar step (and not in steps of VS).  If the inner-loop
6543      step happens to be a multiple of VS, then the misalignment remains fixed
6544      and we can use the optimized realignment scheme.  For example:
6545
6546       for (i=0; i<N; i++)
6547         for (j=0; j<M; j++)
6548           s += a[i+j];
6549
6550      When vectorizing the i-loop in the above example, the step between
6551      consecutive vector loads is 1, and so the misalignment does not remain
6552      fixed across the execution of the inner-loop, and the realignment cannot
6553      be optimized (as illustrated in the following pseudo vectorized loop):
6554
6555       for (i=0; i<N; i+=4)
6556         for (j=0; j<M; j++){
6557           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6558                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6559                          // (assuming that we start from an aligned address).
6560           }
6561
6562      We therefore have to use the unoptimized realignment scheme:
6563
6564       for (i=0; i<N; i+=4)
6565           for (j=k; j<M; j+=4)
6566           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6567                            // that the misalignment of the initial address is
6568                            // 0).
6569
6570      The loop can then be vectorized as follows:
6571
6572       for (k=0; k<4; k++){
6573         rt = get_realignment_token (&vp[k]);
6574         for (i=0; i<N; i+=4){
6575           v1 = vp[i+k];
6576           for (j=k; j<M; j+=4){
6577             v2 = vp[i+j+VS-1];
6578             va = REALIGN_LOAD <v1,v2,rt>;
6579             vs += va;
6580             v1 = v2;
6581           }
6582         }
6583     } */
6584
6585   if (DR_IS_READ (dr))
6586     {
6587       bool is_packed = false;
6588       tree type = (TREE_TYPE (DR_REF (dr)));
6589
6590       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6591           && (!targetm.vectorize.builtin_mask_for_load
6592               || targetm.vectorize.builtin_mask_for_load ()))
6593         {
6594           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6595
6596           /* If we are doing SLP then the accesses need not have the
6597              same alignment, instead it depends on the SLP group size.  */
6598           if (loop_vinfo
6599               && STMT_SLP_TYPE (stmt_info)
6600               && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6601                               * GROUP_SIZE (vinfo_for_stmt
6602                                             (GROUP_FIRST_ELEMENT (stmt_info))),
6603                               TYPE_VECTOR_SUBPARTS (vectype)))
6604             ;
6605           else if (!loop_vinfo
6606                    || (nested_in_vect_loop
6607                        && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6608                                     GET_MODE_SIZE (TYPE_MODE (vectype)))))
6609             return dr_explicit_realign;
6610           else
6611             return dr_explicit_realign_optimized;
6612         }
6613       if (!known_alignment_for_access_p (dr))
6614         is_packed = not_size_aligned (DR_REF (dr));
6615
6616       if (targetm.vectorize.support_vector_misalignment
6617             (mode, type, DR_MISALIGNMENT (dr), is_packed))
6618         /* Can't software pipeline the loads, but can at least do them.  */
6619         return dr_unaligned_supported;
6620     }
6621   else
6622     {
6623       bool is_packed = false;
6624       tree type = (TREE_TYPE (DR_REF (dr)));
6625
6626       if (!known_alignment_for_access_p (dr))
6627         is_packed = not_size_aligned (DR_REF (dr));
6628
6629      if (targetm.vectorize.support_vector_misalignment
6630            (mode, type, DR_MISALIGNMENT (dr), is_packed))
6631        return dr_unaligned_supported;
6632     }
6633
6634   /* Unsupported.  */
6635   return dr_unaligned_unsupported;
6636 }